library(tidyverse)
library(kableExtra)
library(corrplot)
library(broom)
basketball <- read.csv('basketball.csv', sep = ',')
colnames(basketball)<- c("height", "weight", "success_goals", "free_throws", "avg_scores")
dim(basketball)
## [1] 54 5
kable(basketball) %>%
kable_styling() %>%
scroll_box(width = "700px", height = "400px")
height | weight | success_goals | free_throws | avg_scores |
---|---|---|---|---|
6.8 | 225 | 0.442 | 0.672 | 9.2 |
6.3 | 180 | 0.435 | 0.797 | 11.7 |
6.4 | 190 | 0.456 | 0.761 | 15.8 |
6.2 | 180 | 0.416 | 0.651 | 8.6 |
6.9 | 205 | 0.449 | 0.900 | 23.2 |
6.4 | 225 | 0.431 | 0.780 | 27.4 |
6.3 | 185 | 0.487 | 0.771 | 9.3 |
6.8 | 235 | 0.469 | 0.750 | 16.0 |
6.9 | 235 | 0.435 | 0.818 | 4.7 |
6.7 | 210 | 0.480 | 0.825 | 12.5 |
6.9 | 245 | 0.516 | 0.632 | 20.1 |
6.9 | 245 | 0.493 | 0.757 | 9.1 |
6.3 | 185 | 0.374 | 0.709 | 8.1 |
6.1 | 185 | 0.424 | 0.782 | 8.6 |
6.2 | 180 | 0.441 | 0.775 | 20.3 |
6.8 | 220 | 0.503 | 0.880 | 25.0 |
6.5 | 194 | 0.503 | 0.833 | 19.2 |
7.6 | 225 | 0.425 | 0.571 | 3.3 |
6.3 | 210 | 0.371 | 0.816 | 11.2 |
7.1 | 240 | 0.504 | 0.714 | 10.5 |
6.8 | 225 | 0.400 | 0.765 | 10.1 |
7.3 | 263 | 0.482 | 0.655 | 7.2 |
6.4 | 210 | 0.475 | 0.244 | 13.6 |
6.8 | 235 | 0.428 | 0.728 | 9.0 |
7.2 | 230 | 0.559 | 0.721 | 24.6 |
6.4 | 190 | 0.441 | 0.757 | 12.6 |
6.6 | 220 | 0.492 | 0.747 | 5.6 |
6.8 | 210 | 0.402 | 0.739 | 8.7 |
6.1 | 180 | 0.415 | 0.713 | 7.7 |
6.5 | 235 | 0.492 | 0.742 | 24.1 |
6.4 | 185 | 0.484 | 0.861 | 11.7 |
6.0 | 175 | 0.387 | 0.721 | 7.7 |
6.0 | 192 | 0.436 | 0.785 | 9.6 |
7.3 | 263 | 0.482 | 0.655 | 7.2 |
6.1 | 180 | 0.340 | 0.821 | 12.3 |
6.7 | 240 | 0.516 | 0.728 | 8.9 |
6.4 | 210 | 0.475 | 0.846 | 13.6 |
5.8 | 160 | 0.412 | 0.813 | 11.2 |
6.9 | 230 | 0.411 | 0.595 | 2.8 |
7.0 | 245 | 0.407 | 0.573 | 3.2 |
7.3 | 228 | 0.445 | 0.726 | 9.4 |
5.9 | 155 | 0.291 | 0.707 | 11.9 |
6.2 | 200 | 0.449 | 0.804 | 15.4 |
6.8 | 235 | 0.546 | 0.784 | 7.4 |
7.0 | 235 | 0.480 | 0.744 | 18.9 |
5.9 | 105 | 0.359 | 0.839 | 7.9 |
6.1 | 180 | 0.528 | 0.790 | 12.2 |
5.7 | 185 | 0.352 | 0.701 | 11.0 |
7.1 | 245 | 0.414 | 0.778 | 2.8 |
5.8 | 180 | 0.425 | 0.872 | 11.8 |
7.4 | 240 | 0.599 | 0.713 | 17.1 |
6.8 | 225 | 0.482 | 0.701 | 11.6 |
6.8 | 215 | 0.457 | 0.734 | 5.8 |
7.0 | 230 | 0.435 | 0.764 | 8.3 |
# adding an extra column with dichotomous term
basketball$success <- ifelse(basketball$success_goals >=0.5, 1, 0)
basketball <- basketball[, c(1,2,3,6,4,5)] #re-arrange columns
#Quadratic
basketball$success_goals <- basketball$success_goals^2
#dichotomous vs. quantitative
basketball$dvsq <- round(basketball$height * basketball$success, 2)
dim(basketball)
## [1] 54 7
kable(basketball) %>%
kable_styling() %>%
scroll_box(width = "700px", height = "400px")
height | weight | success_goals | success | free_throws | avg_scores | dvsq |
---|---|---|---|---|---|---|
6.8 | 225 | 0.195364 | 0 | 0.672 | 9.2 | 0.0 |
6.3 | 180 | 0.189225 | 0 | 0.797 | 11.7 | 0.0 |
6.4 | 190 | 0.207936 | 0 | 0.761 | 15.8 | 0.0 |
6.2 | 180 | 0.173056 | 0 | 0.651 | 8.6 | 0.0 |
6.9 | 205 | 0.201601 | 0 | 0.900 | 23.2 | 0.0 |
6.4 | 225 | 0.185761 | 0 | 0.780 | 27.4 | 0.0 |
6.3 | 185 | 0.237169 | 0 | 0.771 | 9.3 | 0.0 |
6.8 | 235 | 0.219961 | 0 | 0.750 | 16.0 | 0.0 |
6.9 | 235 | 0.189225 | 0 | 0.818 | 4.7 | 0.0 |
6.7 | 210 | 0.230400 | 0 | 0.825 | 12.5 | 0.0 |
6.9 | 245 | 0.266256 | 1 | 0.632 | 20.1 | 6.9 |
6.9 | 245 | 0.243049 | 0 | 0.757 | 9.1 | 0.0 |
6.3 | 185 | 0.139876 | 0 | 0.709 | 8.1 | 0.0 |
6.1 | 185 | 0.179776 | 0 | 0.782 | 8.6 | 0.0 |
6.2 | 180 | 0.194481 | 0 | 0.775 | 20.3 | 0.0 |
6.8 | 220 | 0.253009 | 1 | 0.880 | 25.0 | 6.8 |
6.5 | 194 | 0.253009 | 1 | 0.833 | 19.2 | 6.5 |
7.6 | 225 | 0.180625 | 0 | 0.571 | 3.3 | 0.0 |
6.3 | 210 | 0.137641 | 0 | 0.816 | 11.2 | 0.0 |
7.1 | 240 | 0.254016 | 1 | 0.714 | 10.5 | 7.1 |
6.8 | 225 | 0.160000 | 0 | 0.765 | 10.1 | 0.0 |
7.3 | 263 | 0.232324 | 0 | 0.655 | 7.2 | 0.0 |
6.4 | 210 | 0.225625 | 0 | 0.244 | 13.6 | 0.0 |
6.8 | 235 | 0.183184 | 0 | 0.728 | 9.0 | 0.0 |
7.2 | 230 | 0.312481 | 1 | 0.721 | 24.6 | 7.2 |
6.4 | 190 | 0.194481 | 0 | 0.757 | 12.6 | 0.0 |
6.6 | 220 | 0.242064 | 0 | 0.747 | 5.6 | 0.0 |
6.8 | 210 | 0.161604 | 0 | 0.739 | 8.7 | 0.0 |
6.1 | 180 | 0.172225 | 0 | 0.713 | 7.7 | 0.0 |
6.5 | 235 | 0.242064 | 0 | 0.742 | 24.1 | 0.0 |
6.4 | 185 | 0.234256 | 0 | 0.861 | 11.7 | 0.0 |
6.0 | 175 | 0.149769 | 0 | 0.721 | 7.7 | 0.0 |
6.0 | 192 | 0.190096 | 0 | 0.785 | 9.6 | 0.0 |
7.3 | 263 | 0.232324 | 0 | 0.655 | 7.2 | 0.0 |
6.1 | 180 | 0.115600 | 0 | 0.821 | 12.3 | 0.0 |
6.7 | 240 | 0.266256 | 1 | 0.728 | 8.9 | 6.7 |
6.4 | 210 | 0.225625 | 0 | 0.846 | 13.6 | 0.0 |
5.8 | 160 | 0.169744 | 0 | 0.813 | 11.2 | 0.0 |
6.9 | 230 | 0.168921 | 0 | 0.595 | 2.8 | 0.0 |
7.0 | 245 | 0.165649 | 0 | 0.573 | 3.2 | 0.0 |
7.3 | 228 | 0.198025 | 0 | 0.726 | 9.4 | 0.0 |
5.9 | 155 | 0.084681 | 0 | 0.707 | 11.9 | 0.0 |
6.2 | 200 | 0.201601 | 0 | 0.804 | 15.4 | 0.0 |
6.8 | 235 | 0.298116 | 1 | 0.784 | 7.4 | 6.8 |
7.0 | 235 | 0.230400 | 0 | 0.744 | 18.9 | 0.0 |
5.9 | 105 | 0.128881 | 0 | 0.839 | 7.9 | 0.0 |
6.1 | 180 | 0.278784 | 1 | 0.790 | 12.2 | 6.1 |
5.7 | 185 | 0.123904 | 0 | 0.701 | 11.0 | 0.0 |
7.1 | 245 | 0.171396 | 0 | 0.778 | 2.8 | 0.0 |
5.8 | 180 | 0.180625 | 0 | 0.872 | 11.8 | 0.0 |
7.4 | 240 | 0.358801 | 1 | 0.713 | 17.1 | 7.4 |
6.8 | 225 | 0.232324 | 0 | 0.701 | 11.6 | 0.0 |
6.8 | 215 | 0.208849 | 0 | 0.734 | 5.8 | 0.0 |
7.0 | 230 | 0.189225 | 0 | 0.764 | 8.3 | 0.0 |
After including the said terms in the dataframe I took a closer look at dvsq and noticed something.
df <- subset(basketball, basketball$success==1)
kable(df) %>%
kable_styling() %>%
scroll_box(width = "700px", height = "400px")
height | weight | success_goals | success | free_throws | avg_scores | dvsq | |
---|---|---|---|---|---|---|---|
11 | 6.9 | 245 | 0.266256 | 1 | 0.632 | 20.1 | 6.9 |
16 | 6.8 | 220 | 0.253009 | 1 | 0.880 | 25.0 | 6.8 |
17 | 6.5 | 194 | 0.253009 | 1 | 0.833 | 19.2 | 6.5 |
20 | 7.1 | 240 | 0.254016 | 1 | 0.714 | 10.5 | 7.1 |
25 | 7.2 | 230 | 0.312481 | 1 | 0.721 | 24.6 | 7.2 |
36 | 6.7 | 240 | 0.266256 | 1 | 0.728 | 8.9 | 6.7 |
44 | 6.8 | 235 | 0.298116 | 1 | 0.784 | 7.4 | 6.8 |
47 | 6.1 | 180 | 0.278784 | 1 | 0.790 | 12.2 | 6.1 |
51 | 7.4 | 240 | 0.358801 | 1 | 0.713 | 17.1 | 7.4 |
Players who are taller than 6ft
with matched weight have higher success in scoring more goals. Let’s see what the model will tell us.
par(mfrow=c(1,2))
b_cor <- cor(basketball)
corrplot(b_cor, method = "circle")
corrplot(b_cor, method = "number")
pairs(basketball, pch=21)
By looking at the above plots we can see relationships between the variables. For instance, with height
and weight
, we can observe that there is somewhat a strong relationship between them while with success_goals
and avg_scores
there seems to be a very weak relationship.
basketball_lm <- lm(avg_scores ~ height + weight + success_goals + success + free_throws + dvsq, data = basketball)
summary(basketball_lm)
##
## Call:
## lm(formula = avg_scores ~ height + weight + success_goals + success +
## free_throws + dvsq, data = basketball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.8629 -3.3563 -0.8627 1.8945 15.0850
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.37380 15.52754 1.377 0.1752
## height -4.84320 3.03984 -1.593 0.1178
## weight 0.02133 0.04615 0.462 0.6462
## success_goals 42.16688 23.25897 1.813 0.0762 .
## success -46.68407 35.62309 -1.311 0.1964
## free_throws 11.93155 7.81288 1.527 0.1334
## dvsq 7.14273 5.24398 1.362 0.1797
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.364 on 47 degrees of freedom
## Multiple R-squared: 0.2668, Adjusted R-squared: 0.1731
## F-statistic: 2.85 on 6 and 47 DF, p-value: 0.01891
In creating this model, we want to keep the predictors with a p-value that does not exeed the threshold of 0.05
. We can see that weight
has a p-value of 0.6462
which encourages us to remove that variable from the model. Weight is not that significant after all.
Removing weight
…
basketball_lm <- update(basketball_lm, .~. - weight, data = basketball)
summary(basketball_lm)
##
## Call:
## lm(formula = avg_scores ~ height + success_goals + success +
## free_throws + dvsq, data = basketball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6431 -3.6178 -0.9641 2.0449 15.6501
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.830 14.399 1.308 0.1972
## height -3.786 1.985 -1.908 0.0624 .
## success_goals 44.743 22.395 1.998 0.0514 .
## success -45.257 35.197 -1.286 0.2047
## free_throws 11.331 7.640 1.483 0.1446
## dvsq 6.911 5.177 1.335 0.1882
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.32 on 48 degrees of freedom
## Multiple R-squared: 0.2634, Adjusted R-squared: 0.1867
## F-statistic: 3.433 on 5 and 48 DF, p-value: 0.009874
Removing success
with 0.2047
basketball_lm <- update(basketball_lm, .~. - success, data = basketball)
summary(basketball_lm)
##
## Call:
## lm(formula = avg_scores ~ height + success_goals + free_throws +
## dvsq, data = basketball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.9480 -3.3230 -0.6853 2.4111 15.7896
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.6685 14.1241 1.039 0.3041
## height -3.1059 1.9255 -1.613 0.1132
## success_goals 45.1652 22.5416 2.004 0.0507 .
## free_throws 10.8074 7.6803 1.407 0.1657
## dvsq 0.2739 0.4020 0.681 0.4990
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.356 on 49 degrees of freedom
## Multiple R-squared: 0.2381, Adjusted R-squared: 0.1759
## F-statistic: 3.827 on 4 and 49 DF, p-value: 0.008745
Removing dvsq
with p_value of 0.4990
basketball_lm <- update(basketball_lm, .~. - dvsq, data = basketball)
summary(basketball_lm)
##
## Call:
## lm(formula = avg_scores ~ height + success_goals + free_throws,
## data = basketball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.3412 -3.4562 -0.9371 2.7112 15.6373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.545 13.952 0.971 0.33630
## height -3.242 1.905 -1.702 0.09501 .
## success_goals 55.456 16.640 3.333 0.00162 **
## free_throws 11.106 7.627 1.456 0.15159
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.327 on 50 degrees of freedom
## Multiple R-squared: 0.2308, Adjusted R-squared: 0.1847
## F-statistic: 5.002 on 3 and 50 DF, p-value: 0.004126
Removing free_throws
with p_value of 0.15463
basketball_lm <- update(basketball_lm, .~. - free_throws, data = basketball)
summary(basketball_lm)
##
## Call:
## lm(formula = avg_scores ~ height + success_goals, data = basketball)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9911 -3.5787 -0.9251 2.2237 15.9712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 26.384 10.932 2.414 0.019428 *
## height -4.036 1.845 -2.187 0.033333 *
## success_goals 58.534 16.686 3.508 0.000952 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.385 on 51 degrees of freedom
## Multiple R-squared: 0.1982, Adjusted R-squared: 0.1668
## F-statistic: 6.304 on 2 and 51 DF, p-value: 0.003577
All of the predictors have a p_value below the threshold which brings us to our final model:
\[avg\_scores = 26.384 - (4.036 \times height) + (58.534 \times success_goals)\]
Since this is a Multiple Linear Regression model, we’ll look at the \(Adjusted \ R^2\) since it removes any noise or bias.
Notice with each update, the \(Adjusted \ R^2\) value decreased as well as the p_value
but the F-statistic
increased. This indicates model doesn’t explain much of variation of the data but it is still significant due to the p_value being 0.003577
. It is better than not having a model at all.
basketballlm_df <- augment(basketball_lm)
ggplot(basketballlm_df, aes(x = .fitted, y = .resid)) + geom_point() + geom_hline(yintercept=0, color = 'brown', size = 1) + ggtitle('Residual vs Fitted')
The residuals are randomly plotted above and below the residual line which shows that the model could be a good fit.
ggplot(basketballlm_df, aes(x=.std.resid)) + geom_histogram(aes(y=..density..), bins = 10, colour="black")+
geom_density(alpha=.2, fill="green") + ggtitle('Histogram of Residuals')
Based on the histogram, most of the residuals are right skewed.
qplot(sample =.std.resid, data = basketballlm_df) + geom_abline()+ ggtitle('Normal Q-Q Plot')
We have to be a little cautious here. There’s a pattern here which confirms what we saw on the histogram with the residuals being right skewed, however we would not reject the model.
It does make sense for when predicting average scores, it may be based on height and how often the play scores a goal. Based on the p value our model is significant, however based on the \(R^2\), there may be other variables that may contribute to average scores a player gets.