This analysis applies a Random Forest model to predict shot outcomes and compares its performance to the logistic regression model developed in Analysis I.
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(ggplot2)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
data <- read_excel("/Users/maxemelo/Downloads/hot_hand_shot_tracker_SPSS.xlsx")
data <- data %>%
rename(
Shot_Result = `Shot_Result\r\n(1=Make 0=Miss)`
)
data <- data %>%
arrange(Player_ID, Session_ID, Shot_Number) %>%
group_by(Player_ID, Session_ID) %>%
mutate(Previous_Shot = lag(Shot_Result)) %>%
ungroup() %>%
filter(!is.na(Previous_Shot))
set.seed(123)
TrainIndex <- createDataPartition(data$Shot_Result, p = 0.8, list = FALSE)
Train <- data[TrainIndex, ]
Test <- data[-TrainIndex, ]
model_rf <- randomForest(
as.factor(Shot_Result) ~ Previous_Shot + Shot_Location + Player_ID + Shot_Number,
data = Train,
ntree = 500,
importance = TRUE
)
rf_preds <- predict(model_rf, newdata = Test)
rf_accuracy <- mean(rf_preds == as.factor(Test$Shot_Result))
rf_accuracy
## [1] 0.5612245
confusionMatrix(rf_preds, as.factor(Test$Shot_Result))
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 62 55
## 1 31 48
##
## Accuracy : 0.5612
## 95% CI : (0.4887, 0.6318)
## No Information Rate : 0.5255
## P-Value [Acc > NIR] : 0.17630
##
## Kappa : 0.131
##
## Mcnemar's Test P-Value : 0.01313
##
## Sensitivity : 0.6667
## Specificity : 0.4660
## Pos Pred Value : 0.5299
## Neg Pred Value : 0.6076
## Prevalence : 0.4745
## Detection Rate : 0.3163
## Detection Prevalence : 0.5969
## Balanced Accuracy : 0.5663
##
## 'Positive' Class : 0
##
rf_probs <- predict(model_rf, newdata = Test, type = "prob")[,2]
roc_rf <- roc(Test$Shot_Result, rf_probs)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_rf)
## Area under the curve: 0.6052
plot(roc_rf)
varImpPlot(model_rf)
logit_accuracy <- 0.5663
model_results <- data.frame(
Model = c("Logistic Regression", "Random Forest"),
Accuracy = c(logit_accuracy, rf_accuracy)
)
ggplot(model_results, aes(x = Model, y = Accuracy)) +
geom_bar(stat = "identity", fill = "steelblue") +
geom_text(aes(label = round(Accuracy, 3)), vjust = -0.5) +
labs(
title = "Model Accuracy Comparison",
x = "Model",
y = "Accuracy"
) +
theme_minimal()
ggplot(data, aes(x = Shot_Location, y = Shot_Result)) +
stat_summary(fun = mean, geom = "bar") +
labs(
title = "Shooting Percentage by Location",
x = "Shot Location",
y = "Field Goal Percentage"
) +
theme_minimal()