Overview

This analysis applies a Random Forest model to predict shot outcomes and compares its performance to the logistic regression model developed in Analysis I.

Step 1: Load Data

library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## Loading required package: ggplot2

## Loading required package: lattice

library(randomForest)

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

library(ggplot2)
library(pROC)

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

data <- read_excel("/Users/maxemelo/Downloads/hot_hand_shot_tracker_SPSS.xlsx")

Step 2: Clean Data

data <- data %>%
  rename(
    Shot_Result = `Shot_Result\r\n(1=Make 0=Miss)`
  )

data <- data %>%
  arrange(Player_ID, Session_ID, Shot_Number) %>%
  group_by(Player_ID, Session_ID) %>%
  mutate(Previous_Shot = lag(Shot_Result)) %>%
  ungroup() %>%
  filter(!is.na(Previous_Shot))

Step 3: Train-Test Split

set.seed(123)

TrainIndex <- createDataPartition(data$Shot_Result, p = 0.8, list = FALSE)
Train <- data[TrainIndex, ]
Test <- data[-TrainIndex, ]

Step 4: Random Forest Model

model_rf <- randomForest(
  as.factor(Shot_Result) ~ Previous_Shot + Shot_Location + Player_ID + Shot_Number,
  data = Train,
  ntree = 500,
  importance = TRUE
)

Step 5: Predictions & Accuracy

rf_preds <- predict(model_rf, newdata = Test)
rf_accuracy <- mean(rf_preds == as.factor(Test$Shot_Result))
rf_accuracy

## [1] 0.5612245

Step 6: Confusion Matrix

confusionMatrix(rf_preds, as.factor(Test$Shot_Result))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  0  1
##          0 62 55
##          1 31 48
##                                           
##                Accuracy : 0.5612          
##                  95% CI : (0.4887, 0.6318)
##     No Information Rate : 0.5255          
##     P-Value [Acc > NIR] : 0.17630         
##                                           
##                   Kappa : 0.131           
##                                           
##  Mcnemar's Test P-Value : 0.01313         
##                                           
##             Sensitivity : 0.6667          
##             Specificity : 0.4660          
##          Pos Pred Value : 0.5299          
##          Neg Pred Value : 0.6076          
##              Prevalence : 0.4745          
##          Detection Rate : 0.3163          
##    Detection Prevalence : 0.5969          
##       Balanced Accuracy : 0.5663          
##                                           
##        'Positive' Class : 0               
##

Step 7: ROC Curve & AUC

rf_probs <- predict(model_rf, newdata = Test, type = "prob")[,2]

roc_rf <- roc(Test$Shot_Result, rf_probs)

## Setting levels: control = 0, case = 1

## Setting direction: controls < cases

auc(roc_rf)

## Area under the curve: 0.6052

plot(roc_rf)

Step 8: Feature Importance

varImpPlot(model_rf)

Step 9: Model Comparison

logit_accuracy <- 0.5663

model_results <- data.frame(
  Model = c("Logistic Regression", "Random Forest"),
  Accuracy = c(logit_accuracy, rf_accuracy)
)

ggplot(model_results, aes(x = Model, y = Accuracy)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  geom_text(aes(label = round(Accuracy, 3)), vjust = -0.5) +
  labs(
    title = "Model Accuracy Comparison",
    x = "Model",
    y = "Accuracy"
  ) +
  theme_minimal()

Step 10: Shooting % by Location

ggplot(data, aes(x = Shot_Location, y = Shot_Result)) +
  stat_summary(fun = mean, geom = "bar") +
  labs(
    title = "Shooting Percentage by Location",
    x = "Shot Location",
    y = "Field Goal Percentage"
  ) +
  theme_minimal()

Checkpoint 11: Analysis II (Random Forest)

Max Emelo & Joe Scandalios

2026-04-22

Overview

Step 1: Load Data

Step 2: Clean Data

Step 3: Train-Test Split

Step 4: Random Forest Model

Step 5: Predictions & Accuracy

Step 6: Confusion Matrix

Step 7: ROC Curve & AUC

Step 8: Feature Importance

Step 9: Model Comparison

Step 10: Shooting % by Location