library(tidyverse)
library(knitr)
library(dplyr)
library(ggplot2)

Tidying the Data

In the following code block, I load the CSV of Marley’s baseball data and preview the first rows of the data frame with head(). I am interested in comparing how players at different positions fare in hitting home runs over time.

# Read the raw, wide-format home-run table (one column per season) and
# preview its first six rows.
baseball_df <- read.csv("/Users/mollysiebecker/DATA 607/Baseball Project 2 Data.csv")
head(baseball_df, n = 6)
##   Team Position X2018.HRs X2019.HRs X2021.HRs X2022.HRs X2023.HRs
## 1  BAL        C         3        13        11        13        20
## 2            1B        16        12        33        22        18
## 3            2B        17        24         5        13        13
## 4            3B        24         6         9        13         7
## 5            SS         7        12        11        16         4
## 6            LF        24        13        22        16        16

Below, I rename the columns, fill in the missing team values, and pivot the per-year home run columns into a longer format with one row per team, position, and year.

# Reshape the wide home-run table into one row per team / position / year.
#
# The season columns are selected by name pattern (X<year>.HRs) rather than by
# position, so a reordered or extended CSV cannot silently pivot the wrong
# columns. The four-digit year is extracted from the column name during the
# pivot, which keeps `year` a character column holding values such as "2018".
baseball_df <- baseball_df %>%
  rename(team = Team, position = Position) %>%
  # In the raw file only the first row of each team carries the team code;
  # turn the blanks into NA so fill() can carry the code downward.
  mutate(team = na_if(team, "")) %>%
  fill(team) %>%
  pivot_longer(
    cols = matches("^X[0-9]{4}\\.HRs$"),
    names_to = "year",
    names_pattern = "^X([0-9]{4})\\.HRs$",
    values_to = "home_runs"
  )

Finally, I replace the team and position abbreviations with their full names for greater clarity and display the first 40 rows using kable.

# Swap every code that appears in `labels` (a named character vector,
# code -> full label) for its label; codes without an entry, including NA,
# are returned untouched.
expand_abbreviations <- function(codes, labels) {
  known <- codes %in% names(labels)
  codes[known] <- labels[codes[known]]
  codes
}

# Team codes -> full club names.
baseball_df$team <- expand_abbreviations(
  baseball_df$team,
  c(
    BAL = "Baltimore Orioles",
    BOS = "Boston Red Sox",
    NYY = "New York Yankees",
    TBR = "Tampa Bay Rays",
    TOR = "Toronto Blue Jays"
  )
)

# Position codes -> spelled-out position names.
baseball_df$position <- expand_abbreviations(
  baseball_df$position,
  c(
    "C" = "catcher",
    "1B" = "first base",
    "2B" = "second base",
    "3B" = "third base",
    "SS" = "shortstop",
    "LF" = "left field",
    "CF" = "center field",
    "RF" = "right field",
    "DH" = "designated hitter"
  )
)

# Show only the first 40 rows as a formatted table.
baseball_subset <- baseball_df[seq_len(40), ]
kable(
  baseball_subset,
  format = "pipe",
  col.names = c("Team", "Position", "Year", "Number of Home Runs"),
  caption = "Number of Home Runs by Team, Position, and Year",
  align = "c"
)
Number of Home Runs by Team, Position, and Year
Team Position Year Number of Home Runs
Baltimore Orioles catcher 2018 3
Baltimore Orioles catcher 2019 13
Baltimore Orioles catcher 2021 11
Baltimore Orioles catcher 2022 13
Baltimore Orioles catcher 2023 20
Baltimore Orioles first base 2018 16
Baltimore Orioles first base 2019 12
Baltimore Orioles first base 2021 33
Baltimore Orioles first base 2022 22
Baltimore Orioles first base 2023 18
Baltimore Orioles second base 2018 17
Baltimore Orioles second base 2019 24
Baltimore Orioles second base 2021 5
Baltimore Orioles second base 2022 13
Baltimore Orioles second base 2023 13
Baltimore Orioles third base 2018 24
Baltimore Orioles third base 2019 6
Baltimore Orioles third base 2021 9
Baltimore Orioles third base 2022 13
Baltimore Orioles third base 2023 7
Baltimore Orioles shortstop 2018 7
Baltimore Orioles shortstop 2019 12
Baltimore Orioles shortstop 2021 11
Baltimore Orioles shortstop 2022 16
Baltimore Orioles shortstop 2023 4
Baltimore Orioles left field 2018 24
Baltimore Orioles left field 2019 13
Baltimore Orioles left field 2021 22
Baltimore Orioles left field 2022 16
Baltimore Orioles left field 2023 16
Baltimore Orioles center field 2018 15
Baltimore Orioles center field 2019 10
Baltimore Orioles center field 2021 30
Baltimore Orioles center field 2022 16
Baltimore Orioles center field 2023 15
Baltimore Orioles right field 2018 8
Baltimore Orioles right field 2019 35
Baltimore Orioles right field 2021 18
Baltimore Orioles right field 2022 33
Baltimore Orioles right field 2023 28

Analysis

Below, I create a new variable called “position_group” that indicates whether players are in the infield or outfield, or simply keeps the values of “catcher” or “designated hitter.” Then, I create a new data frame that calculates the average number of home runs for each position group, per year.

# Tag each row with a broader position group: catchers and designated hitters
# keep their own label, the four base positions become "infield", and the
# three field positions become "outfield". Any other value yields NA.
baseball_df <- baseball_df %>%
  mutate(
    position_group = case_when(
      position %in% c("catcher", "designated hitter") ~ position,
      position %in% c("first base", "second base", "third base", "shortstop") ~ "infield",
      position %in% c("left field", "center field", "right field") ~ "outfield"
    )
  )

# Mean home runs for every position group in every season, returned ungrouped.
baseball_summary <- summarize(
  group_by(baseball_df, position_group, year),
  avg_home_runs = mean(home_runs),
  .groups = "drop"
)

Finally, I coerce the “year” variable to be an integer, and display the data in a line graph. I include specific points for each value so as not to mislead the viewer into thinking there were data points for 2020.

# Seasons arrive as text; convert them to integers so the x axis is numeric.
baseball_summary[["year"]] <- as.integer(baseball_summary[["year"]])

# One coloured line per position group, with a point at every observed season.
ggplot(
  data = baseball_summary,
  mapping = aes(x = year, y = avg_home_runs, colour = position_group)
) +
  geom_point() +
  geom_line() +
  ggtitle("Average Number of Home Runs per Year by Position") +
  xlab("Year") +
  ylab("Average Number of Home Runs")

Findings and Recommendations

In almost every year measured, the designated hitters hit more home runs on average than any other position group, which makes sense since they are selected specifically for that role. Likewise, in almost every year measured, catchers hit the fewest home runs, which also makes sense since their position is the most specialized for defense rather than offense. The outfielders generally hit more home runs than the infielders until 2022, when all positions showed a sharp decrease in the number of home runs hit (this was also the only year in which the designated hitters did not hit the most home runs). From 2022 to 2023, the number of home runs hit began to rebound, regardless of position. Further analysis should take other factors into account to try to determine what might have caused the dip in home runs in 2022.