mean <- 50
std_dev <- 50
low_threshold <- 1
high_threshold <- 100
bonus <- 20
n_simulations <- 1e6
# Define the lottery rules
lottery_rules <- function(ticket_value) {
if (ticket_value < low_threshold) {
return(1) # Pity prize
} else if (ticket_value > high_threshold) {
return(ticket_value + bonus) # Value + bonus
} else {
return(ticket_value) # Ticket value
}
}
# Simulate single tickets
set.seed(123)
single_tickets <- rnorm(n_simulations, mean, std_dev)
# Apply lottery rules to single tickets
single_ticket_rewards <- sapply(single_tickets, lottery_rules)
single_ticket_ev <- mean(single_ticket_rewards)
cat("Expected value of a single ticket:", single_ticket_ev, "\n")
## Expected value of a single ticket: 57.47709
# Simulate two tickets for each entry
two_tickets <- matrix(rnorm(n_simulations * 2, mean, std_dev), ncol = 2)
# Apply lottery rules to each ticket and select the higher reward
two_ticket_rewards <- apply(two_tickets, c(1, 2), lottery_rules)
entry_rewards <- apply(two_ticket_rewards, 1, max)
entry_ev <- mean(entry_rewards)
cat("Expected value of an entry (higher of two tickets):", entry_ev, "\n")
## Expected value of an entry (higher of two tickets): 84.39025
#Question 11 OBP Projections
# Load the data
data <- read.csv("~/Desktop/obp.csv")
# Feature Engineering: Weighted OBP and Trend Calculation
# Calculate weighted OBP (weighted by plate appearances)
data <- data %>%
mutate(
Weighted_OBP = rowSums(cbind(
OBP_16 * PA_16,
OBP_17 * PA_17,
OBP_18 * PA_18,
OBP_19 * PA_19,
OBP_20 * PA_20
), na.rm = TRUE) / rowSums(cbind(PA_16, PA_17, PA_18, PA_19, PA_20), na.rm = TRUE),
Weighted_OBP = ifelse(is.nan(Weighted_OBP), 0, Weighted_OBP) # Handle division by zero
)
# Calculate OBP trend (difference between consecutive years' OBPs)
data <- data %>%
mutate(
Trend_OBP = (OBP_20 - OBP_19) + (OBP_19 - OBP_18) + (OBP_18 - OBP_17) + (OBP_17 - OBP_16),
Trend_OBP = Trend_OBP / 4 # Average trend over the years
)
# Prepare data for modeling
features <- c("Weighted_OBP", "Trend_OBP", "PA_20")
target <- "OBP_21"
# Filter out rows with missing OBP_21
data <- data %>%
filter(!is.na(OBP_21) & !is.na(Weighted_OBP) & !is.na(Trend_OBP)) %>%
filter(PA_21 > 60) #Let's filter to make sure guys played at least 20+ games to avoid outliers
# Train-test split (70% train, 30% test)
set.seed(123)
train_index <- createDataPartition(data$OBP_21, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
# Train a Random Forest Model
rf_model <- randomForest(OBP_21 ~ Weighted_OBP + Trend_OBP + PA_20, data = train_data, importance = TRUE, ntree = 500)
# Make predictions for the entire dataset
data$Predicted_OBP_21 <- predict(rf_model, newdata = data)
# Calculate percentage difference
data <- data %>%
mutate(
Percent_Difference = abs(Predicted_OBP_21 - OBP_21) / OBP_21 * 100
)
# Output the first few rows
head(data %>% select(Name, OBP_21, Predicted_OBP_21, Percent_Difference))
## Name OBP_21 Predicted_OBP_21 Percent_Difference
## 1 Mike Trout 0.466 0.3635808 21.978362
## 2 Bryce Harper 0.429 0.3923495 8.543232
## 3 Byron Buxton 0.358 0.3237129 9.577412
## 4 Brandon Belt 0.378 0.3591275 4.992716
## 5 Yasmani Grandal 0.420 0.3767015 10.309167
## 6 Kyle Schwarber 0.374 0.3475265 7.078485
# Calculate the average percentage difference
average_difference <- mean(data$Percent_Difference, na.rm = TRUE)
# Print the average percentage difference
cat("Average Percentage Difference:", average_difference, "\n")
## Average Percentage Difference: 5.609072
ggplot(data, aes(x = OBP_21, y = Predicted_OBP_21)) +
geom_point(alpha = 0.7) +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", size = 1) +
labs(
title = "Actual vs. Predicted OBP (2021)",
x = "Actual OBP (2021)",
y = "Predicted OBP (2021)"
) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
ggplot(data, aes(x = Percent_Difference)) +
geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
geom_density(aes(y = ..density..), color = "red", linetype = "dashed") +
labs(
title = "Distribution of Percentage Differences",
x = "Percentage Difference (%)",
y = "Frequency"
) +
theme_minimal()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
importance <- as.data.frame(randomForest::importance(rf_model))
importance$Feature <- rownames(importance)
ggplot(importance, aes(x = reorder(Feature, IncNodePurity), y = IncNodePurity)) +
geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
coord_flip() +
labs(
title = "Feature Importance",
x = "Feature",
y = "Importance Score"
) +
theme_minimal()
#Trying to determine if age is significant
# Add age as a feature
data <- data %>%
mutate(Age_2021 = as.numeric(format(as.Date("2021-04-01"), "%Y")) - as.numeric(substr(birth_date, 1, 4)))
# Update the features list to include Age_2021
features <- c("Weighted_OBP", "Trend_OBP", "PA_20", "Age_2021")
target <- "OBP_21"
# Filter out rows with missing OBP_21
data <- data %>%
filter(!is.na(OBP_21) & !is.na(Weighted_OBP) & !is.na(Trend_OBP)) %>%
filter(PA_21 > 60) #Let's filter to make sure guys played at least 20+ games to avoid outliers
# Train-test split (70% train, 30% test)
set.seed(123)
train_index <- createDataPartition(data$OBP_21, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]
# Train the model with the new feature
rf_model_age <- randomForest(OBP_21 ~ ., data = train_data[, c(features, target)], importance = TRUE, ntree = 500)
# Evaluate if Age_2021 improves the predictions
data$Age_Predicted_OBP_21 <- predict(rf_model_age, newdata = data)
# Calculate percentage difference
data <- data %>%
mutate(
Age_Percent_Difference = abs(Age_Predicted_OBP_21 - OBP_21) / OBP_21 * 100
)
# Output the first few rows
head(data %>% select(Name, OBP_21, Age_Predicted_OBP_21, Age_Percent_Difference))
## Name OBP_21 Age_Predicted_OBP_21 Age_Percent_Difference
## 1 Mike Trout 0.466 0.3557743 23.653593
## 2 Bryce Harper 0.429 0.3895861 9.187380
## 3 Byron Buxton 0.358 0.3221986 10.000398
## 4 Brandon Belt 0.378 0.3556991 5.899704
## 5 Yasmani Grandal 0.420 0.3737418 11.013866
## 6 Kyle Schwarber 0.374 0.3529915 5.617241
# Calculate the average percentage difference
average_age_difference <- mean(data$Age_Percent_Difference, na.rm = TRUE)
# Print the average percentage difference
cat("Average Percentage Difference:", average_age_difference, "\n")
## Average Percentage Difference: 5.647163
ggplot(data, aes(x = OBP_21, y = Age_Predicted_OBP_21)) +
geom_point(alpha = 0.7) +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", size = 1) +
labs(
title = "Actual vs. Predicted OBP (2021) with Age as Factor",
x = "Actual OBP (2021)",
y = "Predicted OBP (2021)"
) +
theme_minimal()
ggplot(data, aes(x = Age_Percent_Difference)) +
geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
geom_density(aes(y = ..density..), color = "red", linetype = "dashed") +
labs(
title = "Distribution of Percentage Differences",
x = "Percentage Difference (%)",
y = "Frequency"
) +
theme_minimal()
importance <- as.data.frame(randomForest::importance(rf_model_age))
importance$Feature <- rownames(importance)
ggplot(importance, aes(x = reorder(Feature, IncNodePurity), y = IncNodePurity)) +
geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
coord_flip() +
labs(
title = "Feature Importance",
x = "Feature",
y = "Importance Score"
) +
theme_minimal()
Age is not the most significant factor. The Trend in OBP over recent years is more significant so I will stick with the first model.