Question 7 The Lottery

mean <- 50
std_dev <- 50
low_threshold <- 1
high_threshold <- 100
bonus <- 20
n_simulations <- 1e6
# Define the lottery rules
lottery_rules <- function(ticket_value) {
  if (ticket_value < low_threshold) {
    return(1)  # Pity prize
  } else if (ticket_value > high_threshold) {
    return(ticket_value + bonus)  # Value + bonus
  } else {
    return(ticket_value)  # Ticket value
  }
}
# Simulate single tickets
set.seed(123)
single_tickets <- rnorm(n_simulations, mean, std_dev)

# Apply lottery rules to single tickets
single_ticket_rewards <- sapply(single_tickets, lottery_rules)
single_ticket_ev <- mean(single_ticket_rewards)

cat("Expected value of a single ticket:", single_ticket_ev, "\n")
## Expected value of a single ticket: 57.47709
# Simulate two tickets for each entry
two_tickets <- matrix(rnorm(n_simulations * 2, mean, std_dev), ncol = 2)

# Apply lottery rules to each ticket and select the higher reward
two_ticket_rewards <- apply(two_tickets, c(1, 2), lottery_rules)

entry_rewards <- apply(two_ticket_rewards, 1, max)
entry_ev <- mean(entry_rewards)

cat("Expected value of an entry (higher of two tickets):", entry_ev, "\n")
## Expected value of an entry (higher of two tickets): 84.39025

#Question 11 OBP Projections

# Load the data
data <- read.csv("~/Desktop/obp.csv")


# Feature Engineering: Weighted OBP and Trend Calculation
# Calculate weighted OBP (weighted by plate appearances)
data <- data %>%
  mutate(
    Weighted_OBP = rowSums(cbind(
      OBP_16 * PA_16,
      OBP_17 * PA_17,
      OBP_18 * PA_18,
      OBP_19 * PA_19,
      OBP_20 * PA_20
    ), na.rm = TRUE) / rowSums(cbind(PA_16, PA_17, PA_18, PA_19, PA_20), na.rm = TRUE),
    Weighted_OBP = ifelse(is.nan(Weighted_OBP), 0, Weighted_OBP) # Handle division by zero
  )

# Calculate OBP trend (difference between consecutive years' OBPs)
data <- data %>%
  mutate(
    Trend_OBP = (OBP_20 - OBP_19) + (OBP_19 - OBP_18) + (OBP_18 - OBP_17) + (OBP_17 - OBP_16),
    Trend_OBP = Trend_OBP / 4 # Average trend over the years
  )

# Prepare data for modeling
features <- c("Weighted_OBP", "Trend_OBP", "PA_20")
target <- "OBP_21"

# Filter out rows with missing OBP_21
data <- data %>%
  filter(!is.na(OBP_21) & !is.na(Weighted_OBP) & !is.na(Trend_OBP)) %>%
  filter(PA_21 > 60) #Let's filter to make sure guys played at least 20+ games to avoid outliers

# Train-test split (70% train, 30% test)
set.seed(123)
train_index <- createDataPartition(data$OBP_21, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

# Train a Random Forest Model
rf_model <- randomForest(OBP_21 ~ Weighted_OBP + Trend_OBP + PA_20, data = train_data, importance = TRUE, ntree = 500)

# Make predictions for the entire dataset
data$Predicted_OBP_21 <- predict(rf_model, newdata = data)

# Calculate percentage difference
data <- data %>%
  mutate(
    Percent_Difference = abs(Predicted_OBP_21 - OBP_21) / OBP_21 * 100
  )


# Output the first few rows
head(data %>% select(Name, OBP_21, Predicted_OBP_21, Percent_Difference))
##              Name OBP_21 Predicted_OBP_21 Percent_Difference
## 1      Mike Trout  0.466        0.3635808          21.978362
## 2    Bryce Harper  0.429        0.3923495           8.543232
## 3    Byron Buxton  0.358        0.3237129           9.577412
## 4    Brandon Belt  0.378        0.3591275           4.992716
## 5 Yasmani Grandal  0.420        0.3767015          10.309167
## 6  Kyle Schwarber  0.374        0.3475265           7.078485
# Calculate the average percentage difference
average_difference <- mean(data$Percent_Difference, na.rm = TRUE)

# Print the average percentage difference
cat("Average Percentage Difference:", average_difference, "\n")
## Average Percentage Difference: 5.609072
ggplot(data, aes(x = OBP_21, y = Predicted_OBP_21)) +
  geom_point(alpha = 0.7) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", size = 1) +
  labs(
    title = "Actual vs. Predicted OBP (2021)",
    x = "Actual OBP (2021)",
    y = "Predicted OBP (2021)"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

ggplot(data, aes(x = Percent_Difference)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  geom_density(aes(y = ..density..), color = "red", linetype = "dashed") +
  labs(
    title = "Distribution of Percentage Differences",
    x = "Percentage Difference (%)",
    y = "Frequency"
  ) +
  theme_minimal()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

importance <- as.data.frame(randomForest::importance(rf_model))
importance$Feature <- rownames(importance)

ggplot(importance, aes(x = reorder(Feature, IncNodePurity), y = IncNodePurity)) +
  geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
  coord_flip() +
  labs(
    title = "Feature Importance",
    x = "Feature",
    y = "Importance Score"
  ) +
  theme_minimal()

#Trying to determine if age is significant

# Add age as a feature
data <- data %>%
  mutate(Age_2021 = as.numeric(format(as.Date("2021-04-01"), "%Y")) - as.numeric(substr(birth_date, 1, 4)))

# Update the features list to include Age_2021
features <- c("Weighted_OBP", "Trend_OBP", "PA_20", "Age_2021")
target <- "OBP_21"

# Filter out rows with missing OBP_21
data <- data %>%
  filter(!is.na(OBP_21) & !is.na(Weighted_OBP) & !is.na(Trend_OBP)) %>%
  filter(PA_21 > 60) #Let's filter to make sure guys played at least 20+ games to avoid outliers

# Train-test split (70% train, 30% test)
set.seed(123)
train_index <- createDataPartition(data$OBP_21, p = 0.7, list = FALSE)
train_data <- data[train_index, ]
test_data <- data[-train_index, ]

# Train the model with the new feature
rf_model_age <- randomForest(OBP_21 ~ ., data = train_data[, c(features, target)], importance = TRUE, ntree = 500)

# Evaluate if Age_2021 improves the predictions
data$Age_Predicted_OBP_21 <- predict(rf_model_age, newdata = data)

# Calculate percentage difference
data <- data %>%
  mutate(
    Age_Percent_Difference = abs(Age_Predicted_OBP_21 - OBP_21) / OBP_21 * 100
  )


# Output the first few rows
head(data %>% select(Name, OBP_21, Age_Predicted_OBP_21, Age_Percent_Difference))
##              Name OBP_21 Age_Predicted_OBP_21 Age_Percent_Difference
## 1      Mike Trout  0.466            0.3557743              23.653593
## 2    Bryce Harper  0.429            0.3895861               9.187380
## 3    Byron Buxton  0.358            0.3221986              10.000398
## 4    Brandon Belt  0.378            0.3556991               5.899704
## 5 Yasmani Grandal  0.420            0.3737418              11.013866
## 6  Kyle Schwarber  0.374            0.3529915               5.617241
# Calculate the average percentage difference
average_age_difference <- mean(data$Age_Percent_Difference, na.rm = TRUE)

# Print the average percentage difference
cat("Average Percentage Difference:", average_age_difference, "\n")
## Average Percentage Difference: 5.647163
ggplot(data, aes(x = OBP_21, y = Age_Predicted_OBP_21)) +
  geom_point(alpha = 0.7) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed", size = 1) +
  labs(
    title = "Actual vs. Predicted OBP (2021) with Age as Factor",
    x = "Actual OBP (2021)",
    y = "Predicted OBP (2021)"
  ) +
  theme_minimal()

ggplot(data, aes(x = Age_Percent_Difference)) +
  geom_histogram(bins = 30, fill = "blue", color = "black", alpha = 0.7) +
  geom_density(aes(y = ..density..), color = "red", linetype = "dashed") +
  labs(
    title = "Distribution of Percentage Differences",
    x = "Percentage Difference (%)",
    y = "Frequency"
  ) +
  theme_minimal()

importance <- as.data.frame(randomForest::importance(rf_model_age))
importance$Feature <- rownames(importance)

ggplot(importance, aes(x = reorder(Feature, IncNodePurity), y = IncNodePurity)) +
  geom_bar(stat = "identity", fill = "steelblue", alpha = 0.8) +
  coord_flip() +
  labs(
    title = "Feature Importance",
    x = "Feature",
    y = "Importance Score"
  ) +
  theme_minimal()

Age is not the most significant factor. The Trend in OBP over recent years is more significant so I will stick with the first model.