# Set working directory and path to data
setwd("C:/Users/LENOVO/Downloads/Regression Model") # Example path on Windows
# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 523717 28.0 1166141 62.3 660385 35.3
## Vcells 957762 7.4 8388608 64.0 1769879 13.6
cat("\f") # Clear the console
dev.off # Clear the charts
## function (which = dev.cur())
## {
## if (which == 1)
## stop("cannot shut down device 1 (the null device)")
## .External(C_devoff, as.integer(which))
## dev.cur()
## }
## <bytecode: 0x000002cabf568ec0>
## <environment: namespace:grDevices>
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
df <- read.csv("C:\\Users\\LENOVO\\Downloads\\Regression Model\\Project\\sportsref_download.csv")
df1 <- read.csv("C:\\Users\\LENOVO\\Downloads\\sportsref_download.csv")
I am going to modify the dataset with mutate function and use gsub(“\“,”“, .): This replaces any asterisks (*) with an empty string in all columns. This is typically done to clean data where asterisks might be used to denote missing values or special cases.
df <- df %>%
mutate(across(everything(), ~ gsub("\\*", "", .)))
df1 <- df1 %>%
mutate(across(everything(), ~ gsub("\\*", "", .)))
names(df)
## [1] "Rk" "Team" "G" "MP" "FG" "FGA" "FG." "X3P" "X3PA" "X3P."
## [11] "X2P" "X2PA" "X2P." "FT" "FTA" "FT." "ORB" "DRB" "TRB" "AST"
## [21] "STL" "BLK" "TOV" "PF" "PTS"
names(df1)
## [1] "Rk" "Team" "Age"
## [4] "W" "L" "PW"
## [7] "PL" "MOV" "SOS"
## [10] "SRS" "ORtg" "DRtg"
## [13] "NRtg" "Pace" "FTr"
## [16] "X3PAr" "TS." "X"
## [19] "Offense.Four.Factors" "X.1" "X.2"
## [22] "X.3" "X.4" "Defense.Four.Factors"
## [25] "X.5" "X.6" "X.7"
## [28] "X.8" "X.9" "X.10"
## [31] "X.11"
Here’s a breakdown of the variables in your NBA stats dataset, including their definitions:
These definitions cover the main statistical categories typically recorded in an NBA dataset, reflecting various aspects of player and team performance.
# Remove column 'b'
df_new <- subset(df1, select = -c(Age, Rk, PW, PL, MOV, SOS, SRS, ORtg, DRtg, NRtg, FTr, X3PAr, TS., X, Offense.Four.Factors, X.1, X.2, X.3, X.4, Defense.Four.Factors, X.5, X.6, X.7, X.8, X.9, X.10, X.11))
df_new
## Team W L Pace
## 1 Milwaukee Bucks 56 17 105.1
## 2 Boston Celtics 48 24 99.5
## 3 Los Angeles Clippers 49 23 101.5
## 4 Toronto Raptors 53 19 100.9
## 5 Los Angeles Lakers 52 19 100.9
## 6 Dallas Mavericks 43 32 99.3
## 7 Miami Heat 44 29 98.3
## 8 Houston Rockets 44 28 103.7
## 9 Utah Jazz 44 28 98.6
## 10 Philadelphia 76ers 43 30 99
## 11 Denver Nuggets 46 27 97.1
## 12 Indiana Pacers 45 28 98.9
## 13 Oklahoma City Thunder 44 28 98.8
## 14 Phoenix Suns 34 39 101.3
## 15 Brooklyn Nets 35 37 101.4
## 16 Orlando Magic 33 40 98.6
## 17 San Antonio Spurs 32 39 100.5
## 18 Memphis Grizzlies 34 39 102.8
## 19 Portland Trail Blazers 35 39 100.7
## 20 New Orleans Pelicans 30 42 103.7
## 21 Sacramento Kings 31 41 98.9
## 22 Chicago Bulls 22 43 99.7
## 23 Detroit Pistons 20 46 97.6
## 24 Minnesota Timberwolves 19 45 103.4
## 25 Washington Wizards 25 47 102.7
## 26 New York Knicks 21 45 98.6
## 27 Charlotte Hornets 23 42 95.8
## 28 Atlanta Hawks 20 47 103
## 29 Cleveland Cavaliers 19 46 98.7
## 30 Golden State Warriors 15 50 100.3
## 31 <NA> <NA> <NA>
merged_df <- merge(df, df_new, by = "Team")
names(merged_df)
## [1] "Team" "Rk" "G" "MP" "FG" "FGA" "FG." "X3P" "X3PA" "X3P."
## [11] "X2P" "X2PA" "X2P." "FT" "FTA" "FT." "ORB" "DRB" "TRB" "AST"
## [21] "STL" "BLK" "TOV" "PF" "PTS" "W" "L" "Pace"
df_2019 <- subset(merged_df, select = -c(Rk, X2P, X2PA, X2P., FT, FTA, FT., ORB, DRB, TRB, AST, STL, BLK, TOV, PF))
df_2019 <- df_2019 %>% mutate(Year = 2019)
names(df_2019)
## [1] "Team" "G" "MP" "FG" "FGA" "FG." "X3P" "X3PA" "X3P." "PTS"
## [11] "W" "L" "Pace" "Year"
names(df_2019) <- c("Team", "Games", "MinutesPlayed", "FieldGoal", "FieldGoalsAttempted", "FieldGoalsPercentage", "3Point", "3PointAttempted", "3PointPercentage", "Points", "Wins", "Losses", "Pace", "Year")
df_2019 <- df_2019 %>%
relocate(Wins, Losses, .before = MinutesPlayed) %>%
relocate(Year, .before = Games) %>% relocate(Pace, .before = Points)
df21 <- read.csv("C:\\Users\\LENOVO\\Downloads\\Regression Model\\Project\\sportsref_download (1).csv")
df2 <- read.csv("C:\\Users\\LENOVO\\Downloads\\2020-21.csv")
df21 <- df21 %>%
mutate(across(everything(), ~ gsub("\\*", "", .)))
df2 <- df2 %>%
mutate(across(everything(), ~ gsub("\\*", "", .)))
df_new1 <- subset(df2, select = -c(Age, Rk, PW, PL, MOV, SOS, SRS, ORtg, DRtg, NRtg, FTr, X3PAr, TS., X, Offense.Four.Factors, X.1, X.2, X.3, X.4, Defense.Four.Factors, X.5, X.6, X.7, X.8, X.9, X.10, X.11))
df_new1
## Team W L Pace
## 1 Utah Jazz 52 20 98.5
## 2 Los Angeles Clippers 47 25 96.9
## 3 Phoenix Suns 51 21 97.2
## 4 Milwaukee Bucks 46 26 102.2
## 5 Philadelphia 76ers 49 23 99.5
## 6 Denver Nuggets 47 25 97.1
## 7 Brooklyn Nets 48 24 99.5
## 8 Los Angeles Lakers 42 30 98.7
## 9 Dallas Mavericks 42 30 97.3
## 10 New York Knicks 41 31 95.9
## 11 Atlanta Hawks 41 31 97.6
## 12 Portland Trail Blazers 42 30 98.4
## 13 Boston Celtics 36 36 98.3
## 14 Golden State Warriors 39 33 102.2
## 15 Memphis Grizzlies 38 34 100.4
## 16 Miami Heat 40 32 96.6
## 17 Indiana Pacers 34 38 101.6
## 18 New Orleans Pelicans 31 41 100.1
## 19 Toronto Raptors 27 45 99.2
## 20 Chicago Bulls 31 41 99
## 21 San Antonio Spurs 33 39 98.9
## 22 Washington Wizards 34 38 104.1
## 23 Charlotte Hornets 33 39 98.3
## 24 Sacramento Kings 31 41 100
## 25 Detroit Pistons 20 52 97.9
## 26 Minnesota Timberwolves 23 49 101.6
## 27 Houston Rockets 17 55 101.4
## 28 Cleveland Cavaliers 22 50 97.3
## 29 Orlando Magic 21 51 98.7
## 30 Oklahoma City Thunder 22 50 101
merged_df1 <- merge(df21, df_new1, by = "Team")
df_2020 <- subset(merged_df, select = -c(Rk, X2P, X2PA, X2P., FT, FTA, FT., ORB, DRB, TRB, AST, STL, BLK, TOV, PF))
df_2020 <- df_2020 %>% mutate(Year = 2020)
names(df_2020) <- c("Team", "Games", "MinutesPlayed", "FieldGoal", "FieldGoalsAttempted", "FieldGoalsPercentage", "3Point", "3PointAttempted", "3PointPercentage", "Points", "Wins", "Losses", "Pace", "Year")
df_2020 <- df_2020 %>%
relocate(Wins, Losses, .before = MinutesPlayed) %>%
relocate(Year, .before = Games) %>% relocate(Pace, .before = Points)
# Combine datasets
combined_data <- bind_rows(df_2019, df_2020)
df31 <- read.csv("C:\\Users\\LENOVO\\Downloads\\Regression Model\\Project\\sportsref_download (2).csv")
df3 <- read.csv("C:\\Users\\LENOVO\\Downloads\\2021-22.csv")
df31 <- df31 %>%
mutate(across(everything(), ~ gsub("\\*", "", .)))
df3 <- df3 %>%
mutate(across(everything(), ~ gsub("\\*", "", .)))
df_new2 <- subset(df3, select = -c(Age, Rk, PW, PL, MOV, SOS, SRS, ORtg, DRtg, NRtg, FTr, X3PAr, TS., X, eFG., TOV., ORB., FT.FGA, X.1, X.2, eFG..1, TOV..1, DRB., FT.FGA.1, Arena, Attend., Attend..G))
merged_df2 <- merge(df31, df_new2, by = "Team")
df_2021 <- subset(merged_df2, select = -c(Rk, X2P, X2PA, X2P., FT, FTA, FT., ORB, DRB, TRB, AST, STL, BLK, TOV, PF))
df_2021 <- df_2021 %>% mutate(Year = 2021)
names(df_2021) <- c("Team", "Games", "MinutesPlayed", "FieldGoal", "FieldGoalsAttempted", "FieldGoalsPercentage", "3Point", "3PointAttempted", "3PointPercentage", "Points", "Wins", "Losses", "Pace", "Year")
df_2021 <- df_2021 %>%
relocate(Wins, Losses, .before = MinutesPlayed) %>%
relocate(Year, .before = Games) %>% relocate(Pace, .before = Points)
df41 <- read.csv("C:\\Users\\LENOVO\\Downloads\\Regression Model\\Project\\sportsref_download (3).csv")
df4 <- read.csv("C:\\Users\\LENOVO\\Downloads\\2022-23.csv")
df41 <- df41 %>%
mutate(across(everything(), ~ gsub("\\*", "", .)))
df4 <- df4 %>%
mutate(across(everything(), ~ gsub("\\*", "", .)))
df_new4 <- subset(df4, select = -c(Age, Rk, PW, PL, MOV, SOS, SRS, ORtg, DRtg, NRtg, FTr, X3PAr, TS., X, Offense.Four.Factors, X.1, X.2, X.3, X.4, Defense.Four.Factors, X.5, X.6, X.7, X.8, X.9, X.10, X.11))
merged_df3 <- merge(df41, df_new4, by = "Team")
df_2022 <- subset(merged_df3, select = -c(Rk, X2P, X2PA, X2P., FT, FTA, FT., ORB, DRB, TRB, AST, STL, BLK, TOV, PF))
df_2022 <- df_2022 %>% mutate(Year = 2022)
names(df_2022) <- c("Team", "Games", "MinutesPlayed", "FieldGoal", "FieldGoalsAttempted", "FieldGoalsPercentage", "3Point", "3PointAttempted", "3PointPercentage", "Points", "Wins", "Losses", "Pace", "Year")
df_2022 <- df_2022 %>%
relocate(Wins, Losses, .before = MinutesPlayed) %>%
relocate(Year, .before = Games) %>% relocate(Pace, .before = Points)
# Combine datasets
combined_data <- bind_rows(combined_data, df_2021, df_2022)
“I’ve merged the datasets from 2019 to 2022, and now the data is ready for exploration, visualization, and regression modeling.”
data <- combined_data %>%
mutate(
Team = as.factor(Team),
Year = as.factor(Year)
)
str(data)
## 'data.frame': 120 obs. of 14 variables:
## $ Team : Factor w/ 30 levels "Atlanta Hawks",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : Factor w/ 4 levels "2019","2020",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Games : chr "67" "72" "72" "65" ...
## $ Wins : chr "20" "48" "35" "23" ...
## $ Losses : chr "47" "24" "37" "42" ...
## $ MinutesPlayed : chr "243" "242.1" "242.8" "242.3" ...
## $ FieldGoal : chr "40.6" "41.3" "40.4" "37.3" ...
## $ FieldGoalsAttempted : chr "90.6" "89.6" "90.3" "85.9" ...
## $ FieldGoalsPercentage: chr "0.449" "0.461" "0.448" "0.434" ...
## $ 3Point : chr "12" "12.6" "13.1" "12.1" ...
## $ 3PointAttempted : chr "36.1" "34.5" "38.1" "34.3" ...
## $ 3PointPercentage : chr "0.333" "0.364" "0.343" "0.352" ...
## $ Pace : chr "103" "99.5" "101.4" "95.8" ...
## $ Points : chr "111.8" "113.7" "111.8" "102.9" ...
Columns like Games, Wins, Losses, MinutesPlayed, FieldGoal, etc., are stored as character types instead of numeric. This prevents you from performing mathematical operations or analyses directly on these columns.We have to change the datasets to mumeric.
# Convert columns to numeric
data$Wins <- as.numeric(as.character(data$Wins))
data$Losses <- as.numeric(as.character(data$Losses))
data$MinutesPlayed <- as.numeric(as.character(data$MinutesPlayed))
data$FieldGoal <- as.numeric(as.character(data$FieldGoal))
data$FieldGoalsAttempted <- as.numeric(as.character(data$FieldGoalsAttempted))
data$FieldGoalsPercentage <- as.numeric(as.character(data$FieldGoalsPercentage))
data$Pace <- as.numeric(as.character(data$Pace))
data$`3Point` <- as.numeric(as.character(data$`3Point`))
data$`3PointAttempted` <- as.numeric(as.character(data$`3PointAttempted`))
data$`3PointPercentage` <- as.numeric(as.character(data$`3PointPercentage`))
data$Points <- as.numeric(as.character(data$Points))
data$Games <- as.numeric(as.character(data$Games))
data$FieldGoalsPercentage <- as.numeric(as.character(data$FieldGoalsPercentage))
We need winning percentage for the y variable. Incorporating winning percentage into your analysis helps quantify the success of various strategies and provides a direct measure of team performance, which is essential for understanding the impact of 3-point shooting and other metrics.
Formula for calculate winning percentage: WP = Wins / Wins + Losses.
data$WinningPercentage <- round(data$Wins / (data$Wins + data$Losses), 3)
str(data)
## 'data.frame': 120 obs. of 15 variables:
## $ Team : Factor w/ 30 levels "Atlanta Hawks",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Year : Factor w/ 4 levels "2019","2020",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Games : num 67 72 72 65 65 65 75 73 66 65 ...
## $ Wins : num 20 48 35 23 22 19 43 46 20 15 ...
## $ Losses : num 47 24 37 42 43 46 32 27 46 50 ...
## $ MinutesPlayed : num 243 242 243 242 241 ...
## $ FieldGoal : num 40.6 41.3 40.4 37.3 39.6 40.3 41.7 42 39.3 38.6 ...
## $ FieldGoalsAttempted : num 90.6 89.6 90.3 85.9 88.6 87.9 90.3 88.9 85.7 88.2 ...
## $ FieldGoalsPercentage: num 0.449 0.461 0.448 0.434 0.447 0.458 0.461 0.473 0.459 0.438 ...
## $ 3Point : num 12 12.6 13.1 12.1 12.2 11.2 15.1 11 12 10.4 ...
## $ 3PointAttempted : num 36.1 34.5 38.1 34.3 35.1 31.8 41.3 30.6 32.7 31.3 ...
## $ 3PointPercentage : num 0.333 0.364 0.343 0.352 0.348 0.351 0.367 0.359 0.367 0.334 ...
## $ Pace : num 103 99.5 101.4 95.8 99.7 ...
## $ Points : num 112 114 112 103 107 ...
## $ WinningPercentage : num 0.299 0.667 0.486 0.354 0.338 0.292 0.573 0.63 0.303 0.231 ...
We can see that every variable in the dataset is numerical, with the exception of the team and year variables. so that we can quickly analyze this dataset.
dim_info <- dim(data)
num_rows <- dim_info[1]
num_cols <- dim_info[2]
cat("Dimension of training set: Number of rows:", num_rows, ", Number of cols:", num_cols, "\n")
## Dimension of training set: Number of rows: 120 , Number of cols: 15
sapply(data, function(x) sum(is.na(x)))
## Team Year Games
## 0 0 0
## Wins Losses MinutesPlayed
## 0 0 0
## FieldGoal FieldGoalsAttempted FieldGoalsPercentage
## 0 0 0
## 3Point 3PointAttempted 3PointPercentage
## 0 0 0
## Pace Points WinningPercentage
## 0 0 0
The dataset contains no missing values.
hist(data$Points, main="Distribution of Points", xlab="Points", col="lightblue")
hist(data$'3PointPercentage', main="Distribution of 3-Point Percentage", xlab="3-Point Percentage", col="lightgreen")
hist(data$Pace, main = "Distribution of Pace", xlab = "Pace", col = "lightblue")
Distribution Of Points:
From the histogram, there is systematic shape(Normal Distribution) it may indicate a consistent pattern in the distribution of points. This could suggest that most teams score within a specific range, showing a regularity in scoring performance across the dataset.
Range: The spread from 105 to 120 points indicates that the majority of the teams score between these values. This range represents the central cluster of scoring performances.
The center of the distribution at 112 points indicates that the average or most common point value for the teams in your dataset is around 112. This suggests that a typical team’s score is close to this value.
Frequency The peak at between 110-115 indicates that this is the most common point value among the teams in your dataset. This suggests that the majority of teams score 113 points more frequently than any other specific score. some team has above 120 points and some team has below 105 with low frequency.
Distribution Of 3 Point Percentage
Distribution Type: The systematic shape with a peak at 0.36 and a central tendency around 0.35 to 0.36, combined with the spread, suggests a normal-like distribution or a single-peaked distribution. This indicates that most teams have shooting percentages clustered around 0.35 to 0.36, with decreasing frequencies as you move away from this center.
Frequency: With the highest frequency at 0.36 and a central spread, it appears that most teams have a 3-point shooting percentage close to this value.
Spread: The range from 0.32 to 0.39 shows there is some variability in shooting accuracy, but most teams are concentrated around the center.
Distribution of Pace
Central Tendency: With a center at 100, the typical pace for most teams is around this value. Spread: The range from 96 to 106 indicates overall variability in team paces.
Peak Frequency: The highest frequency being above 30 suggests a common pace range where most teams fall.
Uniform with a Gap: The histogram shows a uniform distribution with a notable gap between 104 and 105, suggesting that these paces are less common.
cor(data[, sapply(data, is.numeric)])
## Games Wins Losses MinutesPlayed
## Games 1.00000000 0.43552713 0.10735874 -0.200638567
## Wins 0.43552713 1.00000000 -0.84821526 -0.132771763
## Losses 0.10735874 -0.84821526 1.00000000 0.028589800
## MinutesPlayed -0.20063857 -0.13277176 0.02858980 1.000000000
## FieldGoal 0.29177543 0.46075234 -0.33721200 -0.009574205
## FieldGoalsAttempted -0.09601727 -0.09939768 0.05328682 0.067585365
## FieldGoalsPercentage 0.42268781 0.62533384 -0.44196080 -0.063810885
## 3Point 0.13675058 0.32772826 -0.28150500 0.098302546
## 3PointAttempted 0.09937635 0.15700074 -0.11493144 0.116468257
## 3PointPercentage 0.11233182 0.51918411 -0.50733065 -0.017942695
## Pace -0.31439369 -0.13161744 -0.03961716 -0.040051152
## Points 0.28788418 0.54079786 -0.42790973 0.067675101
## WinningPercentage 0.22274861 0.97150327 -0.94193749 -0.105845675
## FieldGoal FieldGoalsAttempted FieldGoalsPercentage
## Games 0.291775426 -0.09601727 0.42268781
## Wins 0.460752338 -0.09939768 0.62533384
## Losses -0.337211998 0.05328682 -0.44196080
## MinutesPlayed -0.009574205 0.06758536 -0.06381089
## FieldGoal 1.000000000 0.55194183 0.72262785
## FieldGoalsAttempted 0.551941829 1.00000000 -0.17671966
## FieldGoalsPercentage 0.722627848 -0.17671966 1.00000000
## 3Point 0.044467799 0.03393504 0.02100782
## 3PointAttempted -0.069383275 0.14474495 -0.20346909
## 3PointPercentage 0.306161031 -0.26926017 0.57985472
## Pace 0.441195068 0.66343274 -0.02513430
## Points 0.820016867 0.41655798 0.62340438
## WinningPercentage 0.424699163 -0.08040637 0.56731399
## 3Point 3PointAttempted 3PointPercentage Pace
## Games 0.13675058 0.09937635 0.11233182 -0.31439369
## Wins 0.32772826 0.15700074 0.51918411 -0.13161744
## Losses -0.28150500 -0.11493144 -0.50733065 -0.03961716
## MinutesPlayed 0.09830255 0.11646826 -0.01794270 -0.04005115
## FieldGoal 0.04446780 -0.06938327 0.30616103 0.44119507
## FieldGoalsAttempted 0.03393504 0.14474495 -0.26926017 0.66343274
## FieldGoalsPercentage 0.02100782 -0.20346909 0.57985472 -0.02513430
## 3Point 1.00000000 0.93405295 0.40514233 0.18216296
## 3PointAttempted 0.93405295 1.00000000 0.05451085 0.25425550
## 3PointPercentage 0.40514233 0.05451085 1.00000000 -0.11681030
## Pace 0.18216296 0.25425550 -0.11681030 1.00000000
## Points 0.45961510 0.33812601 0.43200539 0.51850681
## WinningPercentage 0.31690962 0.14327361 0.52890710 -0.04539831
## Points WinningPercentage
## Games 0.2878842 0.22274861
## Wins 0.5407979 0.97150327
## Losses -0.4279097 -0.94193749
## MinutesPlayed 0.0676751 -0.10584568
## FieldGoal 0.8200169 0.42469916
## FieldGoalsAttempted 0.4165580 -0.08040637
## FieldGoalsPercentage 0.6234044 0.56731399
## 3Point 0.4596151 0.31690962
## 3PointAttempted 0.3381260 0.14327361
## 3PointPercentage 0.4320054 0.52890710
## Pace 0.5185068 -0.04539831
## Points 1.0000000 0.51626899
## WinningPercentage 0.5162690 1.00000000
WinningPercentage and Wins:
0.972: A very strong positive correlation. This indicates that teams with more wins tend to have a higher winning percentage.
WinningPercentage and Losses:
-0.942: A very strong negative correlation. As losses increase, the winning percentage decreases, which is expected.
3PointPercentage and Points:
0.820: Strong positive correlation. Teams with a higher 3-point shooting percentage tend to score more points.
Points and WinningPercentage:
0.516: Moderate positive correlation. Teams that score more points tend to have a higher winning percentage.
plot(data$'3PointPercentage' ~ data$WinningPercentage, main="3-Point Perecentage vs Winning Percentage", xlab="Winning Percentage", ylab="3-Point Percentage")
abline(lm(data$'3PointPercentage' ~ data$WinningPercentage), col="red")
The scatter plot shows a positive trend, meaning as the winning percentage increases, the 3-point field goal percentage also tends to increase.
pairs(data[, c("Points", "3PointPercentage", "FieldGoalsPercentage", "Pace")], main="Pairwise Plot")
Average Metrics by Year:
library(dplyr)
data %>%
group_by(Year) %>%
summarise(across(where(is.numeric), mean, na.rm = TRUE))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
## ℹ In group 1: `Year = 2019`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
## # A tibble: 4 × 14
## Year Games Wins Losses MinutesPlayed FieldGoal FieldGoalsAttempted
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2019 70.6 35.3 35.3 242. 40.8 88.8
## 2 2020 70.6 35.3 35.3 242. 40.8 88.8
## 3 2021 82 41 41 241. 40.6 88.1
## 4 2022 82 41 41 242. 42.0 88.3
## # ℹ 7 more variables: FieldGoalsPercentage <dbl>, `3Point` <dbl>,
## # `3PointAttempted` <dbl>, `3PointPercentage` <dbl>, Pace <dbl>,
## # Points <dbl>, WinningPercentage <dbl>
The number of games increased from 70.6 in 2019 to 82 in 2021 and 2022. This could be due to changes in the season length or adjustments for the pandemic.
Wins and Losses:
Wins and losses are balanced (41 wins and 41 losses) for 2021 and 2022, indicating that teams have had a more balanced performance compared to 2019 and 2020.
Minutes Played:
The minutes played per game remain fairly consistent across the years, with a slight decrease in 2021 compared to 2019 and 2022.
Field Goals and 3-Point Statistics:
Field Goals: The percentage of field goals attempted and made has increased slightly from 2019 to 2022.
3-Point Statistics: There is a slight increase in 3-point field goals and percentage over the years, indicating a growing emphasis on the 3-point shot.
Pace:
The pace has slightly decreased from 2019 to 2021 but increased again in 2022. This variation may reflect changes in playing style or game strategies over the years.
Points:
Points scored per game have generally increased, indicating higher scoring games or improved offensive strategies.
Winning Percentage:
The winning percentage has been relatively stable, with a small increase in 2021 and 2022. This stability suggests that winning percentages have not fluctuated drastically despite changes in other metrics.
data %>%
group_by(Team) %>%
summarise(AverageWinningPercentage = mean(WinningPercentage, na.rm = TRUE)) %>%
arrange(desc(AverageWinningPercentage))
## # A tibble: 30 × 2
## Team AverageWinningPercentage
## <fct> <dbl>
## 1 Milwaukee Bucks 0.716
## 2 Boston Celtics 0.663
## 3 Toronto Raptors 0.639
## 4 Denver Nuggets 0.623
## 5 Philadelphia 76ers 0.615
## 6 Los Angeles Clippers 0.603
## 7 Los Angeles Lakers 0.598
## 8 Miami Heat 0.597
## 9 Utah Jazz 0.568
## 10 Phoenix Suns 0.565
## # ℹ 20 more rows
We can see Milawaukee Bucks has the highest average winning percentage from 2019-2022. In the meantime Detroit Pistons registered lowest average winning percentage which is 0.27.Other teams has 0.4, 0.4 and 0.6 percentage respectively.
boxplot(Points ~ Year, data=data, main="Points by Year", xlab="Year", ylab="Points")
boxplot(`3PointPercentage` ~ Year, data=data, main="3-Point Percentage by Year", xlab="Year", ylab="3-Point Percentage")
library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.3.3
cor_matrix <- cor(data[, sapply(data, is.numeric)])
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.3.3
## corrplot 0.92 loaded
# Plot the correlation matrix with adjustments
corrplot(cor_matrix, method = "color", type = "upper",
tl.cex = 0.5, # Size of the text labels
addCoef.col = "black", # Add correlation coefficients in black
diag = FALSE, # Remove diagonal elements
tl.col = "black", # Color of the text labels
cl.cex = 0.5, # Size of the color legend
number.cex = 0.5) # Size of the correlation coefficients
library(ggplot2)
library(reshape2)
data_melted <- melt(data, id.vars = c("Team", "Year"))
ggplot(data_melted, aes(x=Year, y=variable, fill=value)) +
geom_tile() +
scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = median(data_melted$value, na.rm=TRUE)) +
labs(title="Heatmap of Metrics by Year", x="Year", y="Metric")
library(ggplot2)
ggplot(data, aes(x=Year, y=Points, group=Team)) +
geom_line(aes(color=Team)) +
labs(title="Points Trend Over Time", x="Year", y="Points") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Model 1: Impact on Winning Percentage
# Linear model for Winning Percentage
model_winning_percentage <- lm(WinningPercentage ~ `3Point` + `3PointAttempted` + `3PointPercentage` + Games + FieldGoal + Pace + Team, data = data)
stargazer(model_winning_percentage,
align = TRUE,
type = 'text',
dep.var.labels = c("Winning Percentage"),
column.labels = c("Model 1: Winning Percentage"),
title = "Linear Regression Models",
no.space = TRUE,
out = "model_summary.txt")
##
## Linear Regression Models
## ======================================================
## Dependent variable:
## ---------------------------
## Winning Percentage
## Model 1: Winning Percentage
## ------------------------------------------------------
## `3Point` -0.426**
## (0.173)
## `3PointAttempted` 0.156**
## (0.061)
## `3PointPercentage` 17.896***
## (6.023)
## Games -0.003
## (0.002)
## FieldGoal 0.045***
## (0.011)
## Pace -0.024***
## (0.008)
## TeamBoston Celtics 0.175***
## (0.065)
## TeamBrooklyn Nets 0.095
## (0.061)
## TeamCharlotte Hornets 0.007
## (0.064)
## TeamChicago Bulls -0.021
## (0.063)
## TeamCleveland Cavaliers -0.015
## (0.065)
## TeamDallas Mavericks 0.072
## (0.069)
## TeamDenver Nuggets 0.074
## (0.065)
## TeamDetroit Pistons -0.091
## (0.065)
## TeamGolden State Warriors 0.051
## (0.064)
## TeamHouston Rockets 0.081
## (0.065)
## TeamIndiana Pacers 0.023
## (0.061)
## TeamLos Angeles Clippers 0.109*
## (0.065)
## TeamLos Angeles Lakers 0.185***
## (0.060)
## TeamMemphis Grizzlies 0.121*
## (0.061)
## TeamMiami Heat 0.148**
## (0.068)
## TeamMilwaukee Bucks 0.261***
## (0.063)
## TeamMinnesota Timberwolves 0.011
## (0.062)
## TeamNew Orleans Pelicans 0.005
## (0.061)
## TeamNew York Knicks 0.052
## (0.065)
## TeamOklahoma City Thunder 0.117*
## (0.062)
## TeamOrlando Magic 0.105
## (0.065)
## TeamPhiladelphia 76ers 0.106
## (0.067)
## TeamPhoenix Suns 0.091
## (0.062)
## TeamPortland Trail Blazers -0.027
## (0.063)
## TeamSacramento Kings -0.004
## (0.061)
## TeamSan Antonio Spurs -0.103
## (0.064)
## TeamToronto Raptors 0.199***
## (0.062)
## TeamUtah Jazz 0.084
## (0.064)
## TeamWashington Wizards -0.030
## (0.062)
## Constant -5.345**
## (2.222)
## ------------------------------------------------------
## Observations 120
## R2 0.748
## Adjusted R2 0.643
## Residual Std. Error 0.085 (df = 84)
## F Statistic 7.135*** (df = 35; 84)
## ======================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Model 2: Imapct on Total Points Scored
model_total_points <- lm(Points ~ `3Point` + `3PointAttempted` + `3PointPercentage` + Games + FieldGoal + Pace + Team, data = data)
stargazer(model_total_points,
align = TRUE,
type = 'text',
dep.var.labels = c("Winning Percentage"),
column.labels = c("Model 1: Winning Percentage"),
title = "Linear Regression Models",
no.space = TRUE,
out = "model_summary.txt")
##
## Linear Regression Models
## ======================================================
## Dependent variable:
## ---------------------------
## Winning Percentage
## Model 1: Winning Percentage
## ------------------------------------------------------
## `3Point` -3.800
## (2.709)
## `3PointAttempted` 1.559
## (0.960)
## `3PointPercentage` 185.147*
## (94.539)
## Games 0.064*
## (0.032)
## FieldGoal 1.844***
## (0.179)
## Pace 0.402***
## (0.132)
## TeamBoston Celtics 0.656
## (1.013)
## TeamBrooklyn Nets -0.504
## (0.959)
## TeamCharlotte Hornets -0.850
## (1.001)
## TeamChicago Bulls -1.886*
## (0.984)
## TeamCleveland Cavaliers -1.347
## (1.013)
## TeamDallas Mavericks 1.357
## (1.078)
## TeamDenver Nuggets -1.270
## (1.028)
## TeamDetroit Pistons -0.353
## (1.021)
## TeamGolden State Warriors -0.421
## (0.997)
## TeamHouston Rockets 1.041
## (1.014)
## TeamIndiana Pacers -2.017**
## (0.964)
## TeamLos Angeles Clippers -0.020
## (1.022)
## TeamLos Angeles Lakers -0.299
## (0.948)
## TeamMemphis Grizzlies -2.014**
## (0.960)
## TeamMiami Heat 1.327
## (1.073)
## TeamMilwaukee Bucks -0.701
## (0.985)
## TeamMinnesota Timberwolves 0.064
## (0.970)
## TeamNew Orleans Pelicans -0.848
## (0.956)
## TeamNew York Knicks 0.275
## (1.014)
## TeamOklahoma City Thunder -0.019
## (0.975)
## TeamOrlando Magic -0.378
## (1.013)
## TeamPhiladelphia 76ers 0.076
## (1.050)
## TeamPhoenix Suns -0.706
## (0.971)
## TeamPortland Trail Blazers -0.498
## (0.993)
## TeamSacramento Kings -0.796
## (0.962)
## TeamSan Antonio Spurs -2.035**
## (0.998)
## TeamToronto Raptors 0.140
## (0.976)
## TeamUtah Jazz 0.363
## (1.010)
## TeamWashington Wizards -0.412
## (0.973)
## Constant -81.150**
## (34.880)
## ------------------------------------------------------
## Observations 120
## R2 0.913
## Adjusted R2 0.877
## Residual Std. Error 1.330 (df = 84)
## F Statistic 25.197*** (df = 35; 84)
## ======================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Model 3: Impact on Field-Goal Percentage
model_field_goal <- lm(FieldGoalsPercentage ~ `3Point` + `3PointAttempted` + `3PointPercentage` + Games + FieldGoal + Pace + Team, data = data)
stargazer(model_field_goal,
align = TRUE,
type = 'text',
dep.var.labels = c("Winning Percentage"),
column.labels = c("Model 1: Winning Percentage"),
title = "Linear Regression Models",
no.space = TRUE,
out = "model_summary.txt")
##
## Linear Regression Models
## ======================================================
## Dependent variable:
## ---------------------------
## Winning Percentage
## Model 1: Winning Percentage
## ------------------------------------------------------
## `3Point` -0.004
## (0.013)
## `3PointAttempted` 0.0005
## (0.005)
## `3PointPercentage` 0.548
## (0.449)
## Games 0.001***
## (0.0002)
## FieldGoal 0.005***
## (0.001)
## Pace -0.001
## (0.001)
## TeamBoston Celtics -0.001
## (0.005)
## TeamBrooklyn Nets 0.002
## (0.005)
## TeamCharlotte Hornets -0.004
## (0.005)
## TeamChicago Bulls 0.002
## (0.005)
## TeamCleveland Cavaliers 0.005
## (0.005)
## TeamDallas Mavericks 0.004
## (0.005)
## TeamDenver Nuggets 0.006
## (0.005)
## TeamDetroit Pistons -0.002
## (0.005)
## TeamGolden State Warriors 0.002
## (0.005)
## TeamHouston Rockets 0.007
## (0.005)
## TeamIndiana Pacers -0.002
## (0.005)
## TeamLos Angeles Clippers -0.005
## (0.005)
## TeamLos Angeles Lakers 0.009**
## (0.005)
## TeamMemphis Grizzlies -0.005
## (0.005)
## TeamMiami Heat 0.004
## (0.005)
## TeamMilwaukee Bucks 0.005
## (0.005)
## TeamMinnesota Timberwolves 0.003
## (0.005)
## TeamNew Orleans Pelicans -0.002
## (0.005)
## TeamNew York Knicks -0.005
## (0.005)
## TeamOklahoma City Thunder -0.001
## (0.005)
## TeamOrlando Magic -0.003
## (0.005)
## TeamPhiladelphia 76ers 0.0003
## (0.005)
## TeamPhoenix Suns -0.003
## (0.005)
## TeamPortland Trail Blazers -0.006
## (0.005)
## TeamSacramento Kings 0.003
## (0.005)
## TeamSan Antonio Spurs -0.010**
## (0.005)
## TeamToronto Raptors -0.008
## (0.005)
## TeamUtah Jazz 0.007
## (0.005)
## TeamWashington Wizards -0.001
## (0.005)
## Constant 0.107
## (0.166)
## ------------------------------------------------------
## Observations 120
## R2 0.861
## Adjusted R2 0.803
## Residual Std. Error 0.006 (df = 84)
## F Statistic 14.861*** (df = 35; 84)
## ======================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
Examine metrics such as Adjusted R-squared, AIC, and BIC to assess model fit and compare performance.
# Model comparison for Winning Percentage
cat("Adjusted R-squared for Winning Percentage Model: ", summary(model_winning_percentage)$adj.r.squared, "\n")
## Adjusted R-squared for Winning Percentage Model: 0.6434271
cat("AIC for Winning Percentage Model: ", AIC(model_winning_percentage), "\n")
## AIC for Winning Percentage Model: -220.7123
# Model comparison for Total Points Scored
cat("Adjusted R-squared for Total Points Scored Model: ", summary(model_total_points)$adj.r.squared, "\n")
## Adjusted R-squared for Total Points Scored Model: 0.8767973
cat("AIC for Total Points Scored Model: ", AIC(model_total_points), "\n")
## AIC for Total Points Scored Model: 440.1166
# Model comparison for Field-Goal Percentage
cat("Adjusted R-squared for Field-Goal Percentage Model: ", summary(model_field_goal)$adj.r.squared, "\n")
## Adjusted R-squared for Field-Goal Percentage Model: 0.8030233
cat("AIC for Field-Goal Percentage Model: ", AIC(model_field_goal), "\n")
## AIC for Field-Goal Percentage Model: -843.8004
library(car)
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.3.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
# Multicollinearity check for Winning Percentage model
vif(model_winning_percentage)
## GVIF Df GVIF^(1/(2*Df))
## `3Point` 992.581564 1 31.505262
## `3PointAttempted` 825.824502 1 28.737162
## `3PointPercentage` 126.813306 1 11.261141
## Games 2.706374 1 1.645106
## FieldGoal 4.721924 1 2.172999
## Pace 5.351204 1 2.313267
## Team 24.826619 29 1.056940
# Residual plots for Winning Percentage model
par(mfrow = c(2, 2))
plot(model_winning_percentage)
# Visualize coefficients
library(broom)
tidy_model_winning_percentage <- tidy(model_winning_percentage)
tidy_model_total_points <- tidy(model_total_points)
tidy_model_field_goal_percentage <- tidy(model_field_goal)
# Load necessary library
library(plm)
## Warning: package 'plm' was built under R version 4.3.3
##
## Attaching package: 'plm'
## The following objects are masked from 'package:dplyr':
##
## between, lag, lead
# Convert to panel data
pdata <- pdata.frame(data, index = c("Team", "Year"))
# Fixed effects models
fe_model_winning_percentage <- plm(WinningPercentage ~ `X3Point` + `X3PointAttempted` + `X3PointPercentage` + Games + FieldGoal + Pace,
data = pdata, model = "within")
fe_model_total_points <- plm(Points ~ `X3Point` + `X3PointAttempted` + `X3PointPercentage` + Games + FieldGoal + Pace,
data = pdata, model = "within")
fe_model_field_goal_percentage <- plm(FieldGoalsPercentage ~ `X3Point` + `X3PointAttempted` + `X3PointPercentage` + Games + FieldGoal + Pace,
data = pdata, model = "within")
# Random effects models
re_model_winning_percentage <- plm(WinningPercentage ~ `X3Point` + `X3PointAttempted` + `X3PointPercentage` + Games + FieldGoal + Pace,
data = pdata, model = "random")
re_model_total_points <- plm(Points ~ `X3Point` + `X3PointAttempted` + `X3PointPercentage` + Games + FieldGoal + Pace,
data = pdata, model = "random")
re_model_field_goal_percentage <- plm(FieldGoalsPercentage ~ `X3Point` + `X3PointAttempted` + `X3PointPercentage` + Games + FieldGoal + Pace,
data = pdata, model = "random")
# Hausman test
phtest(fe_model_winning_percentage, re_model_winning_percentage)
##
## Hausman Test
##
## data: WinningPercentage ~ X3Point + X3PointAttempted + X3PointPercentage + ...
## chisq = 2.4929, df = 6, p-value = 0.8693
## alternative hypothesis: one model is inconsistent
phtest(fe_model_total_points, re_model_total_points)
##
## Hausman Test
##
## data: Points ~ X3Point + X3PointAttempted + X3PointPercentage + Games + ...
## chisq = 0.31311, df = 6, p-value = 0.9994
## alternative hypothesis: one model is inconsistent
phtest(fe_model_field_goal_percentage, re_model_field_goal_percentage)
##
## Hausman Test
##
## data: FieldGoalsPercentage ~ X3Point + X3PointAttempted + X3PointPercentage + ...
## chisq = 5.6637, df = 6, p-value = 0.4619
## alternative hypothesis: one model is inconsistent
# Predictions for fixed effects models
predictions_fe_winning <- predict(fe_model_winning_percentage, newdata = pdata)
predictions_fe_total_points <- predict(fe_model_total_points, newdata = pdata)
predictions_fe_field_goal_percentage <- predict(fe_model_field_goal_percentage, newdata = pdata)
# Comparison with actual values
comparison_winning_percentage <- data.frame(Actual = pdata$WinningPercentage, Predicted = predictions_fe_winning)
comparison_winning_percentage <- round(comparison_winning_percentage, 3)
comparison_total_points <- data.frame(Actual = pdata$Points, Predicted = predictions_fe_total_points)
comparison_total_points <- round(comparison_total_points)
comparison_field_goal_percentage <- data.frame(Actual = pdata$FieldGoalsPercentage, Predicted = predictions_fe_field_goal_percentage)
comparison_field_goal_percentage <- round(comparison_field_goal_percentage, 3)
head(comparison_winning_percentage)
## Actual Predicted
## Atlanta Hawks-2019 0.299 0.310
## Atlanta Hawks-2020 0.299 0.310
## Atlanta Hawks-2021 0.524 0.521
## Atlanta Hawks-2022 0.500 0.481
## Boston Celtics-2019 0.667 0.636
## Boston Celtics-2020 0.667 0.636
head(comparison_total_points)
## Actual Predicted
## Atlanta Hawks-2019 112 112
## Atlanta Hawks-2020 112 112
## Atlanta Hawks-2021 114 114
## Atlanta Hawks-2022 118 119
## Boston Celtics-2019 114 114
## Boston Celtics-2020 114 114
head(comparison_field_goal_percentage)
## Actual Predicted
## Atlanta Hawks-2019 0.449 0.444
## Atlanta Hawks-2020 0.449 0.444
## Atlanta Hawks-2021 0.470 0.477
## Atlanta Hawks-2022 0.483 0.487
## Boston Celtics-2019 0.461 0.465
## Boston Celtics-2020 0.461 0.465
library(ggplot2)
ggplot(comparison_winning_percentage, aes(x = Predicted, y = Actual)) +
geom_point(color = "blue", size = 2, alpha = 0.6) + # Adding color, size, and transparency
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", size = 1) +
labs(title = "Predicted vs Actual Winning Percentage", x = "Predicted Winning Percentage", y = "Actual Winning Percentage") +
theme_minimal() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10)
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(comparison_total_points, aes(x = Predicted, y = Actual)) +
geom_point(color = "green", size = 2, alpha = 0.6) + # Adding color, size, and transparency
geom_abline(intercept = 0, slope = 1, color = "purple", linetype = "dashed", size = 1) +
labs(title = "Predicted vs Actual Total Points Scored", x = "Predicted Total Points", y = "Actual Total Points") +
theme_light() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10)
)
ggplot(comparison_field_goal_percentage, aes(x = Predicted, y = Actual)) +
geom_point(color = "orange", size = 2, alpha = 0.6) + # Adding color, size, and transparency
geom_abline(intercept = 0, slope = 1, color = "darkblue", linetype = "dashed", size = 1) +
labs(title = "Predicted vs Actual Field-Goal Percentage", x = "Predicted Field-Goal Percentage", y = "Actual Field-Goal Percentage") +
theme_classic() +
theme(
plot.title = element_text(size = 14, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10)
)