This analysis tests whether making a shot increases the probability of making the next shot using practice data from Fairfield Club Basketball.
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
data <- read_excel("/Users/maxemelo/Downloads/hot_hand_shot_tracker_SPSS.xlsx")
head(data)
## # A tibble: 6 × 9
## Player_ID Player_Name Practice_Date Session_ID Shot_Number
## <dbl> <chr> <dttm> <dbl> <dbl>
## 1 1 Dylan 2026-03-18 00:00:00 1 1
## 2 1 Dylan 2026-03-18 00:00:00 1 2
## 3 1 Dylan 2026-03-18 00:00:00 1 3
## 4 1 Dylan 2026-03-18 00:00:00 1 4
## 5 1 Dylan 2026-03-18 00:00:00 1 5
## 6 1 Dylan 2026-03-18 00:00:00 1 6
## # ℹ 4 more variables: `Shot_Result\r\n(1=Make 0=Miss)` <dbl>,
## # Shot_Location <chr>, Previous_Shot <dbl>, Current_Streak <dbl>
names(data)
## [1] "Player_ID" "Player_Name"
## [3] "Practice_Date" "Session_ID"
## [5] "Shot_Number" "Shot_Result\r\n(1=Make 0=Miss)"
## [7] "Shot_Location" "Previous_Shot"
## [9] "Current_Streak"
dim(data)
## [1] 1000 9
data <- data %>%
rename(
Shot_Result = `Shot_Result\r\n(1=Make 0=Miss)`
)
data <- data %>%
arrange(Player_ID, Session_ID, Shot_Number) %>%
group_by(Player_ID, Session_ID) %>%
mutate(Previous_Shot = lag(Shot_Result)) %>%
ungroup()
data <- data %>%
filter(!is.na(Previous_Shot))
head(data)
## # A tibble: 6 × 9
## Player_ID Player_Name Practice_Date Session_ID Shot_Number Shot_Result
## <dbl> <chr> <dttm> <dbl> <dbl> <dbl>
## 1 1 Dylan 2026-03-18 00:00:00 1 2 1
## 2 1 Dylan 2026-03-18 00:00:00 1 3 0
## 3 1 Dylan 2026-03-18 00:00:00 1 4 0
## 4 1 Dylan 2026-03-18 00:00:00 1 5 1
## 5 1 Dylan 2026-03-18 00:00:00 1 6 1
## 6 1 Dylan 2026-03-18 00:00:00 1 7 1
## # ℹ 3 more variables: Shot_Location <chr>, Previous_Shot <dbl>,
## # Current_Streak <dbl>
dim(data)
## [1] 980 9
The chi-square test evaluates whether consecutive shot outcomes are independent.
table(data$Shot_Result)
##
## 0 1
## 513 467
table(data$Previous_Shot)
##
## 0 1
## 514 466
hot_hand_table <- table(data$Previous_Shot, data$Shot_Result)
hot_hand_table
##
## 0 1
## 0 273 241
## 1 240 226
chisq.test(hot_hand_table)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: hot_hand_table
## X-squared = 0.19373, df = 1, p-value = 0.6598
Our findings show that there is no significant link between the previous shot result and the next shot result, supporting the hot hand fallacy.
The logistic regression model tests whether previous shot outcome predicts the current shot while controlling for player, shot location, and shot number.
model_logit <- glm(
Shot_Result ~ Previous_Shot + Shot_Location + Player_ID + Shot_Number,
data = data,
family = binomial
)
summary(model_logit)
##
## Call:
## glm(formula = Shot_Result ~ Previous_Shot + Shot_Location + Player_ID +
## Shot_Number, family = binomial, data = data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.5268752 0.2338695 2.253 0.02427 *
## Previous_Shot -0.0385079 0.1314159 -0.293 0.76950
## Shot_LocationLeft Elbow -0.4239362 0.2033440 -2.085 0.03709 *
## Shot_LocationRight Corner -0.4409313 0.2094248 -2.105 0.03525 *
## Shot_LocationRight Elbow -0.2009612 0.2027851 -0.991 0.32168
## Shot_LocationTop of Key -0.5808184 0.2045905 -2.839 0.00453 **
## Player_ID -0.1693231 0.0584820 -2.895 0.00379 **
## Shot_Number 0.0008936 0.0003813 2.343 0.01912 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1356.4 on 979 degrees of freedom
## Residual deviance: 1331.3 on 972 degrees of freedom
## AIC: 1347.3
##
## Number of Fisher Scoring iterations: 4
The logistic regression results align with the chi-square test. Previous shot was not statistically significant, while player and shot location had a stronger influence on shot outcome.
Next, we created random training and test datasets.
set.seed(123)
trainIndex <- createDataPartition(data$Shot_Result, p = 0.8, list = FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]
model_logit_train <- glm(
Shot_Result ~ Previous_Shot + Shot_Location + Player_ID + Shot_Number,
data = train,
family = binomial
)
# Predict probabilities
pred_probs <- predict(model_logit_train, newdata = test, type = "response")
# Convert to 0/1 predictions
pred_class <- ifelse(pred_probs > 0.5, 1, 0)
accuracy <- mean(pred_class == test$Shot_Result)
accuracy
## [1] 0.5663265
confusionMatrix(as.factor(pred_class), as.factor(test$Shot_Result))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 75 67
## 1 18 36
##
## Accuracy : 0.5663
## 95% CI : (0.4938, 0.6368)
## No Information Rate : 0.5255
## P-Value [Acc > NIR] : 0.1416
##
## Kappa : 0.1521
##
## Mcnemar's Test P-Value : 1.926e-07
##
## Sensitivity : 0.8065
## Specificity : 0.3495
## Pos Pred Value : 0.5282
## Neg Pred Value : 0.6667
## Prevalence : 0.4745
## Detection Rate : 0.3827
## Detection Prevalence : 0.7245
## Balanced Accuracy : 0.5780
##
## 'Positive' Class : 0
##
roc_obj <- roc(test$Shot_Result, pred_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.6201
plot(roc_obj)
ggplot(data, aes(x = factor(Previous_Shot), fill = factor(Shot_Result))) +
geom_bar(position = "fill") +
labs(
title = "Probability of Making a Shot Given Previous Shot",
x = "Previous Shot (0 = Miss, 1 = Make)",
y = "Proportion",
fill = "Shot Result"
) +
scale_fill_manual(values = c("tomato", "steelblue"))
ggplot(data, aes(x = Shot_Location, y = Shot_Result)) +
stat_summary(fun = mean, geom = "bar") +
labs(
title = "Shooting Percentage by Location",
x = "Shot Location",
y = "Field Goal Percentage"
) +
theme_minimal()
ggplot(data, aes(x = Player_Name, y = Shot_Result)) +
stat_summary(fun = mean, geom = "bar") +
labs(
title = "Shooting Percentage by Player",
x = "Player Name",
y = "Field Goal Percentage"
) +
theme_minimal()