library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
dataset <- read_delim("/Users/matthewjobe/Downloads/quasi_winshares.csv", delim = ",")
## Rows: 98796 Columns: 16
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): name_common, player_ID, team_ID, lg_ID, def_pos, franch_id, prev_fr...
## dbl (8): age, year_ID, pct_PT, WAR162, quasi_ws, stint_ID, year_acq, year_left
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
In the code below, we filter our data frame to contain years from 2005. I chose to start with the 2005 season because there are statistics in this data frame that were not taken into account by MLB teams until 2005. After filtering the data frame, we then get 5 random samples of 10500 rows which splits the data frame in half. There is replacement within these samples, which means any row can be duplicated in any of the samples.
d1<- dataset|>
filter(2004<year_ID) ##filtering for years
df_1<- sample_n(d1,10500, replace=TRUE) #here were getting a random sample of 10500 rows
df_2<- sample_n(d1,10500, replace= TRUE)
df_3<- sample_n(d1,10500, replace=TRUE)
df_4<- sample_n(d1,10500, replace=TRUE)
df_5<- sample_n(d1,10500, replace=TRUE)
df_1
## # A tibble: 10,500 × 16
## name_common age player_ID year_ID team_ID lg_ID pct_PT WAR162 def_pos
## <chr> <dbl> <chr> <dbl> <chr> <chr> <dbl> <dbl> <chr>
## 1 Brian Bannister 25 bannibr01 2006 NYM NL 0.952 0.515 P
## 2 Nick Hagadone 28 hagadni01 2014 CLE AL 0.721 0.505 P
## 3 Fernando Abad 30 abadfe01 2016 MIN AL 1.21 0.765 P
## 4 Danny Duffy 22 duffyda01 2011 KCR AL 2.84 0.25 P
## 5 Sean Burroughs 24 burrose01 2005 SDP NL 3.10 0.58 3B, P, SS
## 6 Jason Grilli 28 grillja01 2005 DET AL 0.398 0.24 P
## 7 Andy Green 31 greenan01 2009 NYM NL 0.0497 -0.045 2B, 3B
## 8 Jacoby Ellsbury 25 ellsbja01 2009 BOS AL 6.36 2.43 CF
## 9 Brandon Lyon 32 lyonbr01 2012 TOR AL 0.810 0.475 P
## 10 Dewayne Wise 28 wisede01 2006 CIN NL 0.391 -0.605 LF, CF, …
## # ℹ 10,490 more rows
## # ℹ 7 more variables: quasi_ws <dbl>, stint_ID <dbl>, franch_id <chr>,
## # prev_franch <chr>, year_acq <dbl>, year_left <dbl>, next_franch <chr>
In the code below I grouped my samples by lg_ID. lg_ID stands for league ID (American league or National League), and split our samples into 2 groups. I also calculated the mean WAR162, quasi win share, and pct_PT (share of playing time). Quasi Win Share is a statistic that is found by taking three times total wins created per 162 games; generated by adding WAR162 to wins BELOW replacement (determined by playing time) and rounding to nearest whole number. This stat is used to measure a players overall contribution to the teams wins. pct_PT measures the share of total team playing time (measured by plate appearances and leverage-weighted innings) for player, a higher pct_PT would mean that player had more playing time.
Based on these groupings the means are all fairly consistent and there did not appear to be any anomalies. One interesting finding was that there are more players from the National League in all the samples, but mean WAR162 and Quasi Win Share is consistently higher for the American League.
df_1 |>
group_by(lg_ID) |> # Group by American League or National League
summarize(
count1 = n(), # Count
mean_WAR162 = mean(WAR162, na.rm = TRUE), # Mean WAR162 of per league
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Winshare, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE)
)
## # A tibble: 2 × 5
## lg_ID count1 mean_WAR162 mean_quasi mean_pct
## <chr> <int> <dbl> <dbl> <dbl>
## 1 AL 5115 0.699 5.02 2.08
## 2 NL 5385 0.661 5.09 2.13
df_2 |>
group_by(lg_ID) |> # Group by American League or National League
summarize(
count1 = n(), # Count
mean_WAR162 = mean(WAR162, na.rm = TRUE), # Mean WAR162 of per league
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Winshare, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE)
)
## # A tibble: 2 × 5
## lg_ID count1 mean_WAR162 mean_quasi mean_pct
## <chr> <int> <dbl> <dbl> <dbl>
## 1 AL 5112 0.750 5.20 2.10
## 2 NL 5388 0.712 5.28 2.16
df_3 |>
group_by(lg_ID) |> # Group by American League or National League
summarize(
count1 = n(), # Count
mean_WAR162 = mean(WAR162, na.rm = TRUE), # Mean WAR162 of per league
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Winshare, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE)
)
## # A tibble: 2 × 5
## lg_ID count1 mean_WAR162 mean_quasi mean_pct
## <chr> <int> <dbl> <dbl> <dbl>
## 1 AL 5081 0.746 5.22 2.12
## 2 NL 5419 0.682 5.15 2.13
df_4 |>
group_by(lg_ID) |> # Group by American League or National League
summarize(
count1 = n(), # Count
mean_WAR162 = mean(WAR162, na.rm = TRUE), # Mean WAR162 of per league
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Winshare, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE)
)
## # A tibble: 2 × 5
## lg_ID count1 mean_WAR162 mean_quasi mean_pct
## <chr> <int> <dbl> <dbl> <dbl>
## 1 AL 5001 0.744 5.25 2.13
## 2 NL 5499 0.707 5.27 2.16
df_5 |>
group_by(lg_ID) |> # Group by American League or National League
summarize(
count1 = n(), # Count
mean_WAR162 = mean(WAR162, na.rm = TRUE), # Mean WAR162 of per league
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Winshare, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE)
)
## # A tibble: 2 × 5
## lg_ID count1 mean_WAR162 mean_quasi mean_pct
## <chr> <int> <dbl> <dbl> <dbl>
## 1 AL 5133 0.747 5.22 2.11
## 2 NL 5367 0.663 5.04 2.08
In the code below I grouped the samples by year, and examined the count of players in the samples selected from each specific year.
Within these samples, there do not appear to be any anomalies for these groupings either. The counts and means look to be similar for each sample.
df_1 |>
group_by(year_ID) |> # Group by year
summarize(
count1 = n(), # Count of players that played during that year
mean_WAR162 = mean(WAR162, na.rm = TRUE),
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Winshare, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE)
)
## # A tibble: 15 × 5
## year_ID count1 mean_WAR162 mean_quasi mean_pct
## <dbl> <int> <dbl> <dbl> <dbl>
## 1 2005 678 0.694 5.29 2.25
## 2 2006 672 0.718 5.22 2.13
## 3 2007 679 0.690 5.12 2.11
## 4 2008 665 0.650 5.03 2.13
## 5 2009 676 0.750 5.39 2.20
## 6 2010 686 0.669 5.16 2.21
## 7 2011 709 0.670 5.07 2.12
## 8 2012 701 0.777 5.61 2.30
## 9 2013 674 0.702 5.08 2.06
## 10 2014 699 0.666 4.94 2.06
## 11 2015 703 0.635 4.83 2.03
## 12 2016 714 0.720 5.22 2.15
## 13 2017 749 0.673 4.94 2.05
## 14 2018 726 0.608 4.59 1.92
## 15 2019 769 0.589 4.49 1.86
df_2 |>
group_by(year_ID) |> # Group by year
summarize(
count1 = n(), # Count of players that played during that year
mean_WAR162 = mean(WAR162, na.rm = TRUE),
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Winshare, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE)
)
## # A tibble: 15 × 5
## year_ID count1 mean_WAR162 mean_quasi mean_pct
## <dbl> <int> <dbl> <dbl> <dbl>
## 1 2005 681 0.753 5.50 2.28
## 2 2006 700 0.782 5.50 2.22
## 3 2007 676 0.689 5.23 2.21
## 4 2008 633 0.811 5.61 2.22
## 5 2009 627 0.824 5.68 2.25
## 6 2010 712 0.782 5.49 2.19
## 7 2011 643 0.727 5.55 2.36
## 8 2012 672 0.647 4.90 2.07
## 9 2013 656 0.802 5.44 2.11
## 10 2014 715 0.668 4.96 2.06
## 11 2015 747 0.737 5.25 2.12
## 12 2016 715 0.683 4.96 2.03
## 13 2017 740 0.725 5.06 2.02
## 14 2018 788 0.687 4.93 1.96
## 15 2019 795 0.676 4.80 1.90
df_3 |>
group_by(year_ID) |> # Group by year
summarize(
count1 = n(), # Count of players that played during that year
mean_WAR162 = mean(WAR162, na.rm = TRUE),
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Winshare, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE)
)
## # A tibble: 15 × 5
## year_ID count1 mean_WAR162 mean_quasi mean_pct
## <dbl> <int> <dbl> <dbl> <dbl>
## 1 2005 653 0.853 5.93 2.34
## 2 2006 665 0.743 5.34 2.21
## 3 2007 667 0.690 5.17 2.16
## 4 2008 708 0.681 5.16 2.17
## 5 2009 707 0.657 5.09 2.19
## 6 2010 639 0.833 5.71 2.23
## 7 2011 625 0.759 5.41 2.18
## 8 2012 662 0.657 4.99 2.09
## 9 2013 729 0.663 5.10 2.15
## 10 2014 684 0.709 5.20 2.14
## 11 2015 736 0.754 5.31 2.11
## 12 2016 741 0.655 4.81 1.97
## 13 2017 754 0.692 4.99 2.03
## 14 2018 737 0.739 5.02 1.95
## 15 2019 793 0.644 4.79 1.99
df_4 |>
group_by(year_ID) |> # Group by year
summarize(
count1 = n(), # Count of players that played during that year
mean_WAR162 = mean(WAR162, na.rm = TRUE),
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Winshare, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE)
)
## # A tibble: 15 × 5
## year_ID count1 mean_WAR162 mean_quasi mean_pct
## <dbl> <int> <dbl> <dbl> <dbl>
## 1 2005 654 0.807 5.80 2.35
## 2 2006 643 0.792 5.70 2.31
## 3 2007 620 0.800 5.73 2.32
## 4 2008 670 0.678 5.04 2.10
## 5 2009 665 0.774 5.46 2.15
## 6 2010 661 0.737 5.36 2.20
## 7 2011 680 0.858 5.91 2.31
## 8 2012 710 0.667 5.05 2.11
## 9 2013 714 0.759 5.44 2.22
## 10 2014 773 0.680 5.10 2.12
## 11 2015 738 0.746 5.32 2.14
## 12 2016 682 0.675 4.95 2.03
## 13 2017 743 0.692 5.06 2.09
## 14 2018 781 0.624 4.68 1.94
## 15 2019 766 0.629 4.64 1.91
df_5|>
group_by(year_ID) |> # Group by year
summarize(
count1 = n(), # Count of players that played during that year
mean_WAR162 = mean(WAR162, na.rm = TRUE),
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Winshare, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE)
)
## # A tibble: 15 × 5
## year_ID count1 mean_WAR162 mean_quasi mean_pct
## <dbl> <int> <dbl> <dbl> <dbl>
## 1 2005 644 0.697 5.32 2.26
## 2 2006 686 0.680 5.08 2.11
## 3 2007 729 0.697 5.15 2.12
## 4 2008 681 0.773 5.58 2.26
## 5 2009 679 0.642 4.77 1.97
## 6 2010 667 0.745 5.35 2.20
## 7 2011 706 0.771 5.35 2.12
## 8 2012 698 0.778 5.43 2.17
## 9 2013 664 0.733 5.37 2.19
## 10 2014 704 0.675 5 2.07
## 11 2015 765 0.699 5.00 2.03
## 12 2016 691 0.680 4.98 2.03
## 13 2017 716 0.717 5.04 2.02
## 14 2018 712 0.695 4.95 2.01
## 15 2019 758 0.595 4.65 1.97
Below, I wanted to get an idea for how the means for WAR162, Quasi Win Share, and share of playing time for these samples compare to each other. Maximum values for these stats are also provided. Once again, they are all similar.
df_1 |>
summarize(
mean_WAR162 = mean(WAR162, na.rm = TRUE),
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Win Share, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE),
max_WAR162= max(WAR162, na.rm = TRUE), #max value
max_quasi= max(quasi_ws, na.rm = TRUE),
max_pct= max(pct_PT, na.rm = TRUE)
)
## # A tibble: 1 × 6
## mean_WAR162 mean_quasi mean_pct max_WAR162 max_quasi max_pct
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.680 5.06 2.10 9.98 38 7.41
df_2 |>
summarize(
mean_WAR162 = mean(WAR162, na.rm = TRUE),
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Win Share, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE),
max_WAR162= max(WAR162, na.rm = TRUE), #max value
max_quasi= max(quasi_ws, na.rm = TRUE),
max_pct= max(pct_PT, na.rm = TRUE)
)
## # A tibble: 1 × 6
## mean_WAR162 mean_quasi mean_pct max_WAR162 max_quasi max_pct
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.731 5.24 2.13 10.1 39 7.48
df_3 |>
summarize(
mean_WAR162 = mean(WAR162, na.rm = TRUE),
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Win Share, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE),
max_WAR162= max(WAR162, na.rm = TRUE),
max_quasi= max(quasi_ws, na.rm = TRUE), #max value
max_pct= max(pct_PT, na.rm = TRUE)
)
## # A tibble: 1 × 6
## mean_WAR162 mean_quasi mean_pct max_WAR162 max_quasi max_pct
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.713 5.19 2.12 10.6 41 7.41
df_4 |>
summarize(
mean_WAR162 = mean(WAR162, na.rm = TRUE),
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Win Share, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE),
max_WAR162= max(WAR162, na.rm = TRUE),
max_quasi= max(quasi_ws, na.rm = TRUE), #max value
max_pct= max(pct_PT, na.rm = TRUE)
)
## # A tibble: 1 × 6
## mean_WAR162 mean_quasi mean_pct max_WAR162 max_quasi max_pct
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.725 5.26 2.15 10.1 39 7.41
df_5 |>
summarize(
mean_WAR162 = mean(WAR162, na.rm = TRUE),
mean_quasi = mean(quasi_ws, na.rm = TRUE), # Mean WAR162, Quasi Win Share, and pct_PT of that year
mean_pct = mean(pct_PT, na.rm = TRUE),
max_WAR162= max(WAR162, na.rm = TRUE),
max_quasi= max(quasi_ws, na.rm = TRUE), #max value
max_pct= max(pct_PT, na.rm = TRUE)
)
## # A tibble: 1 × 6
## mean_WAR162 mean_quasi mean_pct max_WAR162 max_quasi max_pct
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.704 5.13 2.10 10.6 41 7.48
Unfortunately, I was unable to locate any anomalies within the three samples. While each sample had a different number of players from each year, means for different statistics differed, etc., there was nothing significant. All of the measures that we looked were very similar and consistent.
An anomaly would have been a significantly higher mean for any of these categories. For instance, if sample 1 and 3 had a mean WAR162 of around 0.7 but sample 2’s was 2.1 then something might be going on. As for the grouping by league, if there were 7,500 players from the “NL” and only 3,000 from the “AL” then that would be an anomaly.
While I did not find any major differences within the samples, I can recognize that each sample is different. When drawing conclusion about data you are often not getting information about the population, but only a small sample. While samples can be a great representation of that population, it can differ. My conclusions could be made based on a sample that does not directly resemble that of the population.