Hot Hand Fallacy Analysis

This analysis tests whether making a shot increases the probability of making the next shot using practice data from Fairfield Club Basketball.

Step 1: Load Data and Packages

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
data <- read_excel("/Users/maxemelo/Downloads/hot_hand_shot_tracker_SPSS.xlsx")

head(data)
## # A tibble: 6 × 9
##   Player_ID Player_Name Practice_Date       Session_ID Shot_Number
##       <dbl> <chr>       <dttm>                   <dbl>       <dbl>
## 1         1 Dylan       2026-03-18 00:00:00          1           1
## 2         1 Dylan       2026-03-18 00:00:00          1           2
## 3         1 Dylan       2026-03-18 00:00:00          1           3
## 4         1 Dylan       2026-03-18 00:00:00          1           4
## 5         1 Dylan       2026-03-18 00:00:00          1           5
## 6         1 Dylan       2026-03-18 00:00:00          1           6
## # ℹ 4 more variables: `Shot_Result\r\n(1=Make 0=Miss)` <dbl>,
## #   Shot_Location <chr>, Previous_Shot <dbl>, Current_Streak <dbl>
names(data)
## [1] "Player_ID"                      "Player_Name"                   
## [3] "Practice_Date"                  "Session_ID"                    
## [5] "Shot_Number"                    "Shot_Result\r\n(1=Make 0=Miss)"
## [7] "Shot_Location"                  "Previous_Shot"                 
## [9] "Current_Streak"
dim(data)
## [1] 1000    9

Step 2: Clean Data

data <- data %>%
  rename(
    Shot_Result = `Shot_Result\r\n(1=Make 0=Miss)`
  )

data <- data %>%
  arrange(Player_ID, Session_ID, Shot_Number) %>%
  group_by(Player_ID, Session_ID) %>%
  mutate(Previous_Shot = lag(Shot_Result)) %>%
  ungroup()

data <- data %>%
  filter(!is.na(Previous_Shot))

head(data)
## # A tibble: 6 × 9
##   Player_ID Player_Name Practice_Date       Session_ID Shot_Number Shot_Result
##       <dbl> <chr>       <dttm>                   <dbl>       <dbl>       <dbl>
## 1         1 Dylan       2026-03-18 00:00:00          1           2           1
## 2         1 Dylan       2026-03-18 00:00:00          1           3           0
## 3         1 Dylan       2026-03-18 00:00:00          1           4           0
## 4         1 Dylan       2026-03-18 00:00:00          1           5           1
## 5         1 Dylan       2026-03-18 00:00:00          1           6           1
## 6         1 Dylan       2026-03-18 00:00:00          1           7           1
## # ℹ 3 more variables: Shot_Location <chr>, Previous_Shot <dbl>,
## #   Current_Streak <dbl>
dim(data)
## [1] 980   9

Step 3: Chi-Square Analysis

The chi-square test evaluates whether consecutive shot outcomes are independent.

table(data$Shot_Result)
## 
##   0   1 
## 513 467
table(data$Previous_Shot)
## 
##   0   1 
## 514 466
hot_hand_table <- table(data$Previous_Shot, data$Shot_Result)
hot_hand_table
##    
##       0   1
##   0 273 241
##   1 240 226
chisq.test(hot_hand_table)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  hot_hand_table
## X-squared = 0.19373, df = 1, p-value = 0.6598

Our findings show that there is no significant link between the previous shot result and the next shot result, supporting the hot hand fallacy.

Step 4: Logistic Regression

The logistic regression model tests whether previous shot outcome predicts the current shot while controlling for player, shot location, and shot number.

model_logit <- glm(
  Shot_Result ~ Previous_Shot + Shot_Location + Player_ID + Shot_Number,
  data = data,
  family = binomial
)

summary(model_logit)
## 
## Call:
## glm(formula = Shot_Result ~ Previous_Shot + Shot_Location + Player_ID + 
##     Shot_Number, family = binomial, data = data)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)                0.5268752  0.2338695   2.253  0.02427 * 
## Previous_Shot             -0.0385079  0.1314159  -0.293  0.76950   
## Shot_LocationLeft Elbow   -0.4239362  0.2033440  -2.085  0.03709 * 
## Shot_LocationRight Corner -0.4409313  0.2094248  -2.105  0.03525 * 
## Shot_LocationRight Elbow  -0.2009612  0.2027851  -0.991  0.32168   
## Shot_LocationTop of Key   -0.5808184  0.2045905  -2.839  0.00453 **
## Player_ID                 -0.1693231  0.0584820  -2.895  0.00379 **
## Shot_Number                0.0008936  0.0003813   2.343  0.01912 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1356.4  on 979  degrees of freedom
## Residual deviance: 1331.3  on 972  degrees of freedom
## AIC: 1347.3
## 
## Number of Fisher Scoring iterations: 4

The logistic regression results align with the chi-square test. Previous shot was not statistically significant, while player and shot location had a stronger influence on shot outcome.

Step 5: Train-Test Split

Next, we created random training and test datasets.

set.seed(123)

trainIndex <- createDataPartition(data$Shot_Result, p = 0.8, list = FALSE)
train <- data[trainIndex, ]
test <- data[-trainIndex, ]

Step 6: Train Model and Generate Predictions

model_logit_train <- glm(
  Shot_Result ~ Previous_Shot + Shot_Location + Player_ID + Shot_Number,
  data = train,
  family = binomial
)

# Predict probabilities
pred_probs <- predict(model_logit_train, newdata = test, type = "response")

# Convert to 0/1 predictions
pred_class <- ifelse(pred_probs > 0.5, 1, 0)

Step 7: Model Performance

accuracy <- mean(pred_class == test$Shot_Result)
accuracy
## [1] 0.5663265
confusionMatrix(as.factor(pred_class), as.factor(test$Shot_Result))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 75 67
##          1 18 36
##                                           
##                Accuracy : 0.5663          
##                  95% CI : (0.4938, 0.6368)
##     No Information Rate : 0.5255          
##     P-Value [Acc > NIR] : 0.1416          
##                                           
##                   Kappa : 0.1521          
##                                           
##  Mcnemar's Test P-Value : 1.926e-07       
##                                           
##             Sensitivity : 0.8065          
##             Specificity : 0.3495          
##          Pos Pred Value : 0.5282          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.4745          
##          Detection Rate : 0.3827          
##    Detection Prevalence : 0.7245          
##       Balanced Accuracy : 0.5780          
##                                           
##        'Positive' Class : 0               
## 

Step 8: AUC

roc_obj <- roc(test$Shot_Result, pred_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.6201
plot(roc_obj)

Chart 1: Hot Hand Depiction

ggplot(data, aes(x = factor(Previous_Shot), fill = factor(Shot_Result))) +
  geom_bar(position = "fill") +
  labs(
    title = "Probability of Making a Shot Given Previous Shot",
    x = "Previous Shot (0 = Miss, 1 = Make)",
    y = "Proportion",
    fill = "Shot Result"
  ) +
  scale_fill_manual(values = c("tomato", "steelblue"))

Chart 2: Shooting Percentage by Location

ggplot(data, aes(x = Shot_Location, y = Shot_Result)) +
  stat_summary(fun = mean, geom = "bar") +
  labs(
    title = "Shooting Percentage by Location",
    x = "Shot Location",
    y = "Field Goal Percentage"
  ) +
  theme_minimal()

Chart 3: Shooting Percentage by Player

ggplot(data, aes(x = Player_Name, y = Shot_Result)) +
  stat_summary(fun = mean, geom = "bar") +
  labs(
    title = "Shooting Percentage by Player",
    x = "Player Name",
    y = "Field Goal Percentage"
  ) +
  theme_minimal()