Introduction

Packages

  suppressMessages(library(dplyr))
  library(ggplot2)

Data Imports

#imports the data
salaries = read.csv("salary.csv", sep = ",", header = TRUE)
playoffs = read.csv("SeriesPost.csv", sep = ",", header = TRUE)

summary(salaries)
        year        team_id           league_id          player_id        
   Min.   :1985   Length:25575       Length:25575       Length:25575      
   1st Qu.:1993   Class :character   Class :character   Class :character  
   Median :2000   Mode  :character   Mode  :character   Mode  :character  
   Mean   :2000                                                           
   3rd Qu.:2008                                                           
   Max.   :2015                                                           
       salary        
   Min.   :       0  
   1st Qu.:  275000  
   Median :  550000  
   Mean   : 2008563  
   3rd Qu.: 2250000  
   Max.   :33000000
summary(playoffs)
       yearID        round           teamIDwinner        lgIDwinner       
   Min.   :1884   Length:307         Length:307         Length:307        
   1st Qu.:1970   Class :character   Class :character   Class :character  
   Median :1995   Mode  :character   Mode  :character   Mode  :character  
   Mean   :1981                                                           
   3rd Qu.:2006                                                           
   Max.   :2015                                                           
   teamIDloser         lgIDloser              wins           losses    
   Length:307         Length:307         Min.   : 1.00   Min.   :0.00  
   Class :character   Class :character   1st Qu.: 3.00   1st Qu.:0.00  
   Mode  :character   Mode  :character   Median : 4.00   Median :1.00  
                                         Mean   : 3.57   Mean   :1.42  
                                         3rd Qu.: 4.00   3rd Qu.:2.00  
                                         Max.   :10.00   Max.   :5.00  
        ties         
   Min.   :0.000000  
   1st Qu.:0.000000  
   Median :0.000000  
   Mean   :0.009772  
   3rd Qu.:0.000000  
   Max.   :1.000000

Adding Player Salaries

team_salary <- salaries %>%
  group_by(team_id, year) %>%
  summarize(total_salary = sum(salary, na.rm = TRUE), .groups="drop") %>%
  arrange(year, team_id)

largest_salary_year <- team_salary %>%
    group_by(year) %>%
    slice_max(order_by = total_salary, n=1, with_ties = FALSE)

  print(largest_salary_year, n=31)
  # A tibble: 31 × 3
  # Groups:   year [31]
     team_id  year total_salary
     <chr>   <int>        <int>
   1 ATL      1985     14807000
   2 NYA      1986     18494253
   3 NYA      1987     17099714
   4 NYA      1988     19441152
   5 LAN      1989     21071562
   6 KCA      1990     23361084
   7 OAK      1991     36999167
   8 TOR      1992     44788666
   9 TOR      1993     47279166
  10 ATL      1994     49383513
  11 TOR      1995     50590000
  12 BAL      1996     54490315
  13 NYA      1997     62241545
  14 BAL      1998     72355634
  15 NYA      1999     86734359
  16 NYA      2000     92338260
  17 NYA      2001    112287143
  18 NYA      2002    125928583
  19 NYA      2003    152749814
  20 NYA      2004    184193950
  21 NYA      2005    208306817
  22 NYA      2006    194663079
  23 NYA      2007    189259045
  24 NYA      2008    207896789
  25 NYA      2009    201449189
  26 NYA      2010    206333389
  27 NYA      2011    202275028
  28 NYA      2012    196522289
  29 NYA      2013    231978886
  30 LAN      2014    217014600
  31 LAN      2015    215792000

Calculating Percentage of Highest Paid Player

highest_paid <- salaries %>%
  group_by(team_id, year) %>%
  mutate(team_salary = sum(salary, na.rm = TRUE)) %>%     # team total
  slice_max(order_by = salary, n = 1, with_ties = FALSE) %>% # top player
  mutate(percentage = (salary / team_salary) * 100) %>%
  ungroup()


largest_percent_year <- highest_paid %>%
    group_by(year) %>%
    slice_max(order_by = percentage, n=1, with_ties = FALSE)

print(largest_percent_year, n=31)
  # A tibble: 31 × 7
  # Groups:   year [31]
      year team_id league_id player_id   salary team_salary percentage
     <int> <chr>   <chr>     <chr>        <int>       <int>      <dbl>
   1  1985 PHI     NL        schmimi01  2130300    10124966       21.0
   2  1986 SLN     NL        smithoz01  1940000     9875010       19.6
   3  1987 TEX     AL        harrigr01   620000      880000       70.5
   4  1988 SLN     NL        smithoz01  2340000    12880000       18.2
   5  1989 BAL     AL        ripkeca01  2466667     8275167       29.8
   6  1990 MIN     AL        puckeki01  2816667    14602000       19.3
   7  1991 MON     NL        martide01  3333333    10732333       31.1
   8  1992 MON     NL        martide01  3348333    15822334       21.2
   9  1993 FLO     NL        harvebr01  4225000    19330545       21.9
  10  1994 SDN     NL        gwynnto01  3633333    14916333       24.4
  11  1995 ML4     AL        vaughgr01  4875000    17798825       27.4
  12  1996 DET     AL        fieldce01  9237500    23438000       39.4
  13  1997 DET     AL        frymatr01  6400000    17272000       37.1
  14  1998 FLO     NL        sheffga01 14936667    41322667       36.1
  15  1999 FLO     NL        fernaal01  7000000    21085000       33.2
  16  2000 FLO     NL        fernaal01  7000000    19872000       35.2
  17  2001 MIN     AL        radkebr01  7750000    24130000       32.1
  18  2002 TBA     AL        vaughgr01  8750000    34380000       25.5
  19  2003 TOR     AL        delgaca01 18700000    51269000       36.5
  20  2004 TOR     AL        delgaca01 19700000    50017000       39.4
  21  2005 KCA     AL        sweenmi01 11000000    36881000       29.8
  22  2006 COL     NL        heltoto01 16600000    41233000       40.3
  23  2007 COL     NL        heltoto01 16600000    54041000       30.7
  24  2008 COL     NL        heltoto01 16600000    68655500       24.2
  25  2009 SDN     NL        peavyja01 11000000    43333700       25.4
  26  2010 TOR     AL        wellsve01 15687500    62234000       25.2
  27  2011 HOU     NL        leeca01   19000000    70694000       26.9
  28  2012 HOU     NL        leeca01   19000000    60651000       31.3
  29  2013 MIA     NL        nolasri01 11500000    33601900       34.2
  30  2014 HOU     AL        feldmsc01 12000000    35116300       34.2
  31  2015 PHI     NL        howarry01 25000000   111693000       22.4

Calculating WS Winners by Payroll Rank

team_salary_ranked <- salaries %>%
  group_by(team_id, year) %>%
  summarize(total_salary = sum(salary, na.rm = TRUE), .groups = "drop") %>%
  group_by(year) %>%
  mutate(salary_rank = rank(-total_salary, ties.method = "min")) %>%  # 1 = highest salary
  ungroup()

ws_winners <- playoffs %>%
  filter(round == "WS", yearID >= 1985) %>%
  select(yearID, teamIDwinner)

ws_salary_rank <- ws_winners %>%
  inner_join(team_salary_ranked, by = c("yearID" = "year", "teamIDwinner" = "team_id"))

Plotting the Data

Summary

  $summary
    avg_rank highest_rank lowest_rank
  1 8.233333            1          24
  
  $highest
    yearID teamIDwinner salary_rank
  1   1992          TOR           1
  2   1993          TOR           1
  3   1999          NYA           1
  4   2000          NYA           1
  5   2009          NYA           1
  
  $lowest
    yearID teamIDwinner salary_rank
  1   1987          MIN          24
  2   2003          FLO          24

Calculating WS Winners by Highest Paid (Relative to Team)

top_player_ranked <- highest_paid %>%
  group_by(year) %>%
  mutate(top_share_rank = rank(-percentage, ties.method = "min")) %>% # 1 = highest %
  ungroup()

ws_top_share_rank <- ws_winners %>%
  inner_join(top_player_ranked, 
             by = c("yearID" = "year", "teamIDwinner" = "team_id"))

Plotting WS Winners by Percentage

Summary

  $summary
    avg_rank highest_rank lowest_rank
  1 18.86667            3          30
  
  $highest
    yearID teamIDwinner top_share_rank
  1   1986          NYN              3
  
  $lowest
    yearID teamIDwinner top_share_rank
  1   2007          BOS             30

Creating Potential Model

calculate_ws_probability <- function(salary_rank, top_player_pct_rank) {
  
  # Base probability (1/30 teams = 3.33%)
  base_prob <- 1/30
  
  # Salary rank factor (closer to 8th = better)
  # AVG salary rank winner was 8th
  salary_factor <- exp(-(salary_rank - 8)^2 / 50)  
  
  # Distribution factor (closer to 18th = better, meaning more even distribution)
  #18th was the AVG rank for percentage of highest paid player
  distribution_factor <- exp(-(top_player_pct_rank - 18)^2 / 100) 
  
  
  probability <- base_prob * salary_factor * distribution_factor * 3  # Scale factor
  
  return(pmin(probability, 0.20))
}

team_probabilities <- team_salary_ranked %>%
  left_join(top_player_ranked, by = c("team_id", "year")) %>%
  mutate(
    ws_probability = calculate_ws_probability(salary_rank, top_share_rank),
    ws_percentage = round(ws_probability * 100, 2)
  ) %>%
  arrange(year, desc(ws_probability))

Plot the Predictions of Previous Years

ggplot(team_probabilities, aes(x = factor(year), y = reorder(team_id, ws_probability), 
                               fill = ws_percentage)) +
  geom_tile() +
  scale_fill_gradient2(low = "white", mid = "yellow", high = "red", 
                       midpoint = 6, name = "Win %") +
  labs(title = "World Series Win Probability by Team and Year",
       x = "Year", y = "Team") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90))