# Read the NBA all-seasons CSV into a data frame and print per-column summaries.
# NOTE(review): the path is absolute and machine-specific.
player_data <- read.csv(
  file = "C:/Users/rohan/OneDrive/Desktop/INTRO TO STATISTICS IN R/DATA SETS/Datasets/Data/Nba_all_seasons_1996_2021.csv"
)
summary(player_data)
## X player_name team_abbreviation age
## Min. : 0 Length:12305 Length:12305 Min. :18.00
## 1st Qu.: 3076 Class :character Class :character 1st Qu.:24.00
## Median : 6152 Mode :character Mode :character Median :26.00
## Mean : 6152 Mean :27.08
## 3rd Qu.: 9228 3rd Qu.:30.00
## Max. :12304 Max. :44.00
## player_height player_weight college country
## Min. :160.0 Min. : 60.33 Length:12305 Length:12305
## 1st Qu.:193.0 1st Qu.: 90.72 Class :character Class :character
## Median :200.7 Median : 99.79 Mode :character Mode :character
## Mean :200.6 Mean :100.37
## 3rd Qu.:208.3 3rd Qu.:108.86
## Max. :231.1 Max. :163.29
## draft_year draft_round draft_number gp
## Length:12305 Length:12305 Length:12305 Min. : 1.00
## Class :character Class :character Class :character 1st Qu.:31.00
## Mode :character Mode :character Mode :character Median :57.00
## Mean :51.29
## 3rd Qu.:73.00
## Max. :85.00
## pts reb ast net_rating
## Min. : 0.000 Min. : 0.000 Min. : 0.000 Min. :-250.000
## 1st Qu.: 3.600 1st Qu.: 1.800 1st Qu.: 0.600 1st Qu.: -6.400
## Median : 6.700 Median : 3.000 Median : 1.200 Median : -1.300
## Mean : 8.173 Mean : 3.559 Mean : 1.814 Mean : -2.256
## 3rd Qu.:11.500 3rd Qu.: 4.700 3rd Qu.: 2.400 3rd Qu.: 3.200
## Max. :36.100 Max. :16.300 Max. :11.700 Max. : 300.000
## oreb_pct dreb_pct usg_pct ts_pct
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.02100 1st Qu.:0.096 1st Qu.:0.1490 1st Qu.:0.4800
## Median :0.04100 Median :0.131 Median :0.1810 Median :0.5240
## Mean :0.05447 Mean :0.141 Mean :0.1849 Mean :0.5111
## 3rd Qu.:0.08400 3rd Qu.:0.180 3rd Qu.:0.2170 3rd Qu.:0.5610
## Max. :1.00000 Max. :1.000 Max. :1.0000 Max. :1.5000
## ast_pct season
## Min. :0.0000 Length:12305
## 1st Qu.:0.0660 Class :character
## Median :0.1030 Mode :character
## Mean :0.1314
## 3rd Qu.:0.1780
## Max. :1.0000
# Keep only the rows whose season is 2006-07 (rows with a missing season are
# dropped), then pick the ten rows with the largest `reb` values. Ties stay in
# their original row order.
season_2006_07 <- player_data[which(player_data$season == "2006-07"), ]
top_ten_rows <- head(order(-season_2006_07$reb), 10)
top_players <- season_2006_07[top_ten_rows, ]
print(top_players)
## X player_name team_abbreviation age player_height player_weight
## 4594 4593 Kevin Garnett MIN 31 210.82 114.7588
## 4814 4813 Tyson Chandler NOK 24 215.90 106.5941
## 4589 4588 Dwight Howard ORL 21 210.82 120.2019
## 4716 4715 Marcus Camby DEN 33 210.82 106.5941
## 4844 4843 Carlos Boozer UTA 25 205.74 120.6555
## 4559 4558 Emeka Okafor CHA 24 208.28 114.3052
## 4785 4784 Al Jefferson BOS 22 208.28 116.1196
## 4854 4853 Chris Bosh TOR 23 208.28 104.3262
## 4888 4887 Ben Wallace CHI 32 205.74 108.8621
## 4763 4762 Tim Duncan SAS 31 210.82 117.9339
## college country draft_year draft_round draft_number gp
## 4594 None USA 1995 1 5 76
## 4814 None USA 2001 1 2 73
## 4589 None USA 2004 1 1 82
## 4716 Massachusetts USA 1996 1 2 70
## 4844 Duke USA 2002 2 34 74
## 4559 Connecticut USA 2004 1 2 67
## 4785 None USA 2004 1 15 69
## 4854 Georgia Tech USA 2003 1 4 69
## 4888 Virginia Union USA Undrafted Undrafted Undrafted 77
## 4763 Wake Forest US Virgin Islands 1997 1 1 80
## pts reb ast net_rating oreb_pct dreb_pct usg_pct ts_pct ast_pct season
## 4594 22.4 12.8 4.1 -0.5 0.077 0.308 0.276 0.546 0.190 2006-07
## 4814 9.5 12.4 0.9 -0.9 0.142 0.273 0.122 0.620 0.040 2006-07
## 4589 17.6 12.3 1.9 0.7 0.120 0.288 0.226 0.619 0.092 2006-07
## 4716 11.2 11.7 3.2 3.2 0.080 0.300 0.159 0.519 0.137 2006-07
## 4844 20.9 11.7 3.0 2.1 0.113 0.285 0.265 0.588 0.158 2006-07
## 4559 14.4 11.3 1.2 -2.2 0.126 0.249 0.192 0.554 0.060 2006-07
## 4785 16.0 11.0 1.3 -4.2 0.123 0.267 0.228 0.547 0.071 2006-07
## 4854 22.6 10.7 2.5 4.1 0.082 0.245 0.268 0.577 0.115 2006-07
## 4888 6.4 10.7 2.4 3.9 0.127 0.220 0.107 0.456 0.101 2006-07
## 4763 20.0 10.6 3.4 13.7 0.100 0.268 0.279 0.579 0.182 2006-07
We take each player's average rebounds (the `reb` column) as the response variable.
# Response for the analysis: the `reb` column of the 2006-07 subset.
response_variable <- season_2006_07[["reb"]]
# Distinct team codes present in this season, in order of first appearance.
unique_levels <- unique(season_2006_07[["team_abbreviation"]])
print(unique_levels)
## [1] "SEA" "IND" "NYK" "DEN" "ORL" "PHX" "WAS" "POR" "MIA" "SAS" "NOK" "MIL"
## [13] "UTA" "LAC" "CHA" "HOU" "DAL" "MEM" "NJN" "CLE" "PHI" "ATL" "GSW" "TOR"
## [25] "SAC" "BOS" "MIN" "LAL" "DET" "CHI"
# Treat the team code as a categorical variable (factor) ...
season_2006_07[["team_abbreviation"]] <- as.factor(
  season_2006_07[["team_abbreviation"]]
)
# ... and drop every row that contains at least one NA.
season_2006_07 <- na.omit(season_2006_07)
Here the players' team is taken as the categorical variable on which we build the hypotheses. (In the code below, teams without a player among the season's top ten rebounders are pooled into a single "Other" group, so the test compares 11 groups.) Null Hypothesis (H0): The mean rebounds of players are equal across all team groups.
Alternative Hypothesis (Ha): The mean rebounds of players differ for at least one team group.
# One-way ANOVA of rebounds across team groups.
#
# Grouping: a team that has at least one player among `top_players` keeps its
# own label; every other team is pooled into the single level "Other".
#
# The team column is converted to character first because ifelse() drops
# factor attributes: passing the factor directly would turn the team labels
# into their integer level codes ("4", "7", ...) instead of "BOS", "CHA", ...
team_labels <- as.character(season_2006_07$team_abbreviation)
categorical_variable <- ifelse(
  team_labels %in% top_players$team_abbreviation,
  team_labels,
  "Other"
)
# Re-read the response from the same data frame as the grouping variable so
# both vectors are guaranteed to be row-aligned, even if rows were removed
# from `season_2006_07` after the response was first extracted.
response_variable <- season_2006_07$reb
stopifnot(length(response_variable) == length(categorical_variable))
anova_result <- aov(response_variable ~ as.factor(categorical_variable))
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(categorical_variable) 10 26.2 2.624 0.42 0.937
## Residuals 447 2794.6 6.252
The ANOVA test is run in this code to see if there are any notable variations in rebounding performance between teams. For an ANOVA, the null hypothesis (H0) states that there is no discernible difference in the group means (in this case, the teams). You would reject the null hypothesis, indicating that there is a significant difference between the groups, if the p-value from the ANOVA is less than the selected significance level, which is typically 0.05.
It is possible to conclude that there is sufficient evidence to reject the null hypothesis and suggest that rebound performance is influenced by team affiliation if the p-value is less than 0.05.
Here the p-value (0.937) is greater than 0.05, so we fail to reject the null hypothesis: the data do not provide evidence that mean rebounds differ between the team groups.
# Simple linear regression on the 2006-07 data frame `season_2006_07`:
# `pts` (points) is the predictor and `reb` (rebounds) is the response.
# Start with a scatter plot to eyeball the relationship.
with(
  season_2006_07,
  plot(pts, reb, xlab = "Points", ylab = "Rebounds", main = "Points vs Rebounds")
)
# Least-squares fit of reb on pts.
linear_model <- lm(reb ~ pts, data = season_2006_07)
# Coefficient table, residual quantiles and overall fit statistics.
summary(linear_model)
##
## Call:
## lm(formula = reb ~ pts, data = season_2006_07)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9352 -1.2957 -0.4270 0.8369 8.5806
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.49202 0.15001 9.946 <2e-16 ***
## pts 0.24499 0.01451 16.882 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.951 on 456 degrees of freedom
## Multiple R-squared: 0.3846, Adjusted R-squared: 0.3833
## F-statistic: 285 on 1 and 456 DF, p-value: < 2.2e-16
# F-test for the `pts` term of the simple regression.
# H0: rebounds have no linear relationship with points (slope = 0).
# Ha: rebounds do have a linear relationship with points (slope != 0).
# NOTE: this reuses the name `anova_result`, replacing any earlier value.
anova_result <- stats::anova(linear_model)
print(anova_result)
## Analysis of Variance Table
##
## Response: reb
## Df Sum Sq Mean Sq F value Pr(>F)
## pts 1 1085.0 1084.97 285.01 < 2.2e-16 ***
## Residuals 456 1735.9 3.81
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Diagnostic plots to check model assumptions.
# Lay the plots out on a 2 x 2 grid, then put the previous graphics settings
# back so later plots are not silently forced into the same grid.
old_par <- par(mfrow = c(2, 2))
plot(linear_model)
par(old_par)
# Fitted coefficients: the intercept is the predicted `reb` at pts = 0, and the
# `pts` slope is the expected change in `reb` for a one-unit increase in `pts`.
coef(linear_model)
## (Intercept) pts
## 1.4920161 0.2449879
# Predict rebounds from the fitted line for three hypothetical point values.
# NOTE(review): 100-200 points is far outside the observed range (the summary
# of the full data set shows a maximum `pts` of 36.1), so these predictions are
# extrapolations -- confirm that this is intended.
new_points <- data.frame(pts = c(100, 150, 200))
predicted_rebounds <- predict(linear_model, newdata = new_points)
print(predicted_rebounds)
## 1 2 3
## 25.99081 38.24021 50.48960
# Multiple regression on `season_2006_07`: points explained by assists,
# rebounds and their interaction (`ast * reb` expands to ast + reb + ast:reb).
lm_model <- lm(pts ~ ast * reb, data = season_2006_07)
# Coefficient table and fit statistics for the interaction model.
summary(lm_model)
##
## Call:
## lm(formula = pts ~ ast * reb, data = season_2006_07)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.9283 -1.8779 -0.4917 1.7875 17.2757
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.67827 0.42149 1.609 0.108
## ast 1.71132 0.20422 8.380 6.69e-16 ***
## reb 1.18431 0.11318 10.464 < 2e-16 ***
## ast:reb 0.05030 0.04914 1.024 0.307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.624 on 454 degrees of freedom
## Multiple R-squared: 0.6701, Adjusted R-squared: 0.6679
## F-statistic: 307.4 on 3 and 454 DF, p-value: < 2.2e-16
# Check diagnostic plots.
# Use a 2 x 2 grid for the plots and restore the previous graphics settings
# afterwards.
old_par <- par(mfrow = c(2, 2))
plot(lm_model)
par(old_par)
# Interpretation:
# - Because the model contains the ast:reb interaction, the `ast` coefficient is
#   the change in points per additional assist when reb = 0, and the `reb`
#   coefficient is the change in points per additional rebound when ast = 0.
#   They are conditional slopes, not overall one-unit effects.
# - The ast:reb coefficient is how much the assist slope changes for each
#   additional rebound (equivalently, how much the rebound slope changes per
#   additional assist). It allows a non-additive relationship.
# - In the model summary the interaction term is not statistically significant
#   (p = 0.307), so there is little evidence that the effect of assists on
#   points depends on rebounds.