library(pitchRx) # data pack
  library(RSQLite) # loading in data of c drive
  library(readr) # read in files
  library(dplyr) # cleaing data
  library(tidyr) # cleaning data
  library(forcats) # cleaning data
  library(lubridate) # cleaing dates
  library(outliers) # outliers
  library(corrplot) # correlation plot
  library(RColorBrewer) # colour package
  library(viridis) # colour package
  library(ggplot2) # plots
  library(kableExtra) # html tabel
  library(ggthemes) # plot themes
  library(tinytex) # knit to pdf
  library(knitr) # kniting
  library(prettydoc) # rmarkdown theme

Data Wrangling

The data wrangling process is split into several steps

Get

  database.2010 <- src_sqlite("C:/Data/Baseball/pitchRx 2010.sqlite3", create = FALSE)
  database.2011 <- src_sqlite("C:/Data/Baseball/pitchRx 2011.sqlite3", create = FALSE)
  database.2012 <- src_sqlite("C:/Data/Baseball/pitchRx 2012.sqlite3", create = FALSE)
  database.2013 <- src_sqlite("C:/Data/Baseball/pitchRx 2013.sqlite3", create = FALSE)
  database.2014 <- src_sqlite("C:/Data/Baseball/pitchRx 2014.sqlite3", create = FALSE)
  database.2015 <- src_sqlite("C:/Data/Baseball/pitchRx 2015.sqlite3", create = FALSE)
  database.2016 <- src_sqlite("C:/Data/Baseball/pitchRx 2016.sqlite3", create = FALSE)
  database.2017 <- src_sqlite("C:/Data/Baseball/pitchRx 2017.sqlite3", create = FALSE)
  database.2018 <- src_sqlite("C:/Data/Baseball/pitchRx 2018.sqlite3", create = FALSE)
# 2010
  # pitch table
    pitches.2010 <- tbl(database.2010, "pitch") %>% as.data.frame()
    pitches.2010 <- pitches.2010[ , c(1, 3, 4, 10, 16, 17, 31, 30, 33 ,42:47)]
  # atbat table
    atbat.2010 <- tbl(database.2010, "atbat") %>% as.data.frame()
    atbat.2010 <- atbat.2010[ , c(1, 2, 3, 6, 9, 11, 12, 14, 16, 17, 19, 20, 24, 25, 27, 28)]
  # merge
    total.2010 <- inner_join(pitches.2010, atbat.2010, by = c("num", "gameday_link"))
  # filter
    chris.sale.2010 <- total.2010 %>% filter(pitcher_name == "Chris Sale")

# 2011
  # pitch table
    pitches.2011 <- tbl(database.2011, "pitch") %>% as.data.frame()
    pitches.2011 <- pitches.2011[ , c(1, 3, 4, 10, 16, 17, 31, 30, 33 ,42:47)]
  # atbat table
    atbat.2011 <- tbl(database.2011, "atbat") %>% as.data.frame()
    atbat.2011 <- atbat.2011[ , c(1, 2, 3, 6, 9, 11, 12, 14, 16, 17, 19, 20, 24, 25, 27, 28)]
  # merge
    total.2011 <- inner_join(pitches.2011, atbat.2011, by = c("num", "gameday_link"))
  # filter
    chris.sale.2011 <- total.2011 %>% filter(pitcher_name == "Chris Sale")

# 2012
  # pitch table
    pitches.2012 <- tbl(database.2012, "pitch") %>% as.data.frame()
    pitches.2012 <- pitches.2012[ , c(1, 3, 4, 10, 16, 17, 31, 30, 33 ,42:47)]
  # atbat table
    atbat.2012 <- tbl(database.2012, "atbat") %>% as.data.frame()
    atbat.2012 <- atbat.2012[ , c(1, 2, 3, 6, 9, 11, 12, 14, 16, 17, 19, 20, 24, 25, 27, 28)]
  # merge
    total.2012 <- inner_join(pitches.2012, atbat.2012, by = c("num", "gameday_link"))
  # filter
    chris.sale.2012 <- total.2012 %>% filter(pitcher_name == "Chris Sale")
    
# 2013
  # pitch table
    pitches.2013 <- tbl(database.2013, "pitch") %>% as.data.frame()
    pitches.2013 <- pitches.2013[ , c(1, 3, 4, 10, 16, 17, 31, 30, 33 ,42:47)]
  # atbat table
    atbat.2013 <- tbl(database.2013, "atbat") %>% as.data.frame()
    atbat.2013 <- atbat.2013[ , c(1, 2, 3, 6, 9, 11, 12, 14, 16, 17, 19, 20, 24, 25, 27, 28)]
  # merge
    total.2013 <- inner_join(pitches.2013, atbat.2013, by = c("num", "gameday_link"))
  # filter
    chris.sale.2013 <- total.2013 %>% filter(pitcher_name == "Chris Sale")
  
# 2014
  # pitch table
    pitches.2014 <- tbl(database.2014, "pitch") %>% as.data.frame()
    pitches.2014 <- pitches.2014[ , c(1, 3, 4, 10, 16, 17, 31, 30, 33 ,42:47)]
  # atbat table
    atbat.2014 <- tbl(database.2014, "atbat") %>% as.data.frame()
    atbat.2014 <- atbat.2014[ , c(1, 2, 3, 6, 9, 11, 12, 14, 16, 17, 19, 20, 24, 25, 27, 28)]
  # merge
    total.2014 <- inner_join(pitches.2014, atbat.2014, by = c("num", "gameday_link"))
  # filter
    chris.sale.2014 <- total.2014 %>% filter(pitcher_name == "Chris Sale")
    
# 2015
  # pitch table
    pitches.2015 <- tbl(database.2015, "pitch") %>% as.data.frame()
    pitches.2015 <- pitches.2015[ , c(1, 3, 4, 10, 16, 17, 31, 30, 33 ,42:47)]
  # atbat table
    atbat.2015 <- tbl(database.2015, "atbat") %>% as.data.frame()
    atbat.2015 <- atbat.2015[ , c(1, 2, 3, 6, 9, 11, 12, 14, 16, 17, 19, 20, 24, 25, 27, 28)]
  # merge
    total.2015 <- inner_join(pitches.2015, atbat.2015, by = c("num", "gameday_link"))
  # filter
    chris.sale.2015 <- total.2015 %>% filter(pitcher_name == "Chris Sale")
    
# 2016
  # pitch table
    pitches.2016 <- tbl(database.2016, "pitch") %>% as.data.frame()
    pitches.2016 <- pitches.2016[ , c(1, 3, 4, 10, 16, 17, 31, 30, 33 ,42:47)]
  # atbat table
    atbat.2016 <- tbl(database.2016, "atbat") %>% as.data.frame()
    atbat.2016 <- atbat.2016[ , c(1, 2, 3, 6, 9, 11, 12, 14, 16, 17, 19, 20, 24, 25, 27, 28)]
  # merge
    total.2016 <- inner_join(pitches.2016, atbat.2016, by = c("num", "gameday_link"))
  # filter
    chris.sale.2016 <- total.2016 %>% filter(pitcher_name == "Chris Sale")
    
# 2017
  # pitch table
    pitches.2017 <- tbl(database.2017, "pitch") %>% as.data.frame()
    pitches.2017 <- pitches.2017[ , c(1, 3, 4, 10, 16, 17, 31, 30, 33 ,42:47)]
  # atbat table
    atbat.2017 <- tbl(database.2017, "atbat") %>% as.data.frame()
    atbat.2017 <- atbat.2017[ , c(1, 2, 3, 6, 9, 11, 12, 14, 16, 17, 19, 20, 24, 25, 27, 28)]
  # merge
    total.2017 <- inner_join(pitches.2017, atbat.2017, by = c("num", "gameday_link"))
  # filter
    chris.sale.2017 <- total.2017 %>% filter(pitcher_name == "Chris Sale")
    
# 2018
  # pitch table
    pitches.2018 <- tbl(database.2018, "pitch") %>% as.data.frame()
    pitches.2018 <- pitches.2018[ , c(1, 3, 4, 10, 16, 17, 31, 30, 33 ,42:47)]
  # atbat table
    atbat.2018 <- tbl(database.2018, "atbat") %>% as.data.frame()
    atbat.2018 <- atbat.2018[ , c(1, 2, 3, 6, 9, 11, 12, 14, 16, 17, 19, 20, 24, 25, 27, 28)]
  # merge
    total.2018 <- inner_join(pitches.2018, atbat.2018, by = c("num", "gameday_link"))
  # filter
    chris.sale.2018 <- total.2018 %>% filter(pitcher_name == "Chris Sale")
# total merge
  chris.sale.total <- chris.sale.2010 %>% bind_rows(chris.sale.2011)
  chris.sale.total <- chris.sale.total %>% bind_rows(chris.sale.2012)
  chris.sale.total <- chris.sale.total %>% bind_rows(chris.sale.2013)
  chris.sale.total <- chris.sale.total %>% bind_rows(chris.sale.2014)
  chris.sale.total <- chris.sale.total %>% bind_rows(chris.sale.2015)
  chris.sale.total <- chris.sale.total %>% bind_rows(chris.sale.2016)
  chris.sale.total <- chris.sale.total %>% bind_rows(chris.sale.2017)
  chris.sale.total <- chris.sale.total %>% bind_rows(chris.sale.2018)

Understand

# pitch Outcome 
  # strikes
    chris.sale.total$pitch_outcome <- ifelse(chris.sale.total$type == "S", chris.sale.total$des, NA) 
  # Balls
    chris.sale.total$pitch_outcome <- ifelse(chris.sale.total$type == "B", 
                                             "Ball", chris.sale.total$pitch_outcome)
  # balls in play
    chris.sale.total$pitch_outcome <- ifelse(chris.sale.total$type == "X", 
                                             chris.sale.total$event, chris.sale.total$pitch_outcome)

# Team
  chris.sale.total$team <- ifelse(chris.sale.total$date > 2016, "Boston", "White Sox")
  chris.sale.total$team <- factor(chris.sale.total$team, levels = c("White Sox", "Boston"))
# Changing variable type
  # To factor
    chris.sale.total[, c("des", "type", "pitch_type", "count", "o", "stand", "p_throws", "event", 
                         "inning_side", "outcome")] <- 
      lapply(chris.sale.total[, c("des", "type", "pitch_type", "count", "o", "stand", "p_throws",
                                  "event","inning_side", "pitch_outcome")], as.factor)
  # To Logical
    chris.sale.total[, c("on_1b", "on_2b", "on_3b")] <- 
      lapply(chris.sale.total[, c("on_1b", "on_2b", "on_3b")], as.logical)
    # Replacing NA, with FALSE  
      chris.sale.total$on_1b <- ifelse(is.na(chris.sale.total$on_1b), FALSE, chris.sale.total$on_1b)
      chris.sale.total$on_2b <- ifelse(is.na(chris.sale.total$on_2b), FALSE, chris.sale.total$on_2b)
      chris.sale.total$on_3b <- ifelse(is.na(chris.sale.total$on_3b), FALSE, chris.sale.total$on_3b)  
  # To Date
    chris.sale.total[, "date"] <- chris.sale.total[, "date"] %>% ymd()
# Home team and away team NA to zero
  chris.sale.total$home_team_runs <- ifelse(is.na(chris.sale.total$home_team_runs), 
                                            0, chris.sale.total$home_team_runs)
  chris.sale.total$away_team_runs <- ifelse(is.na(chris.sale.total$away_team_runs), 
                                            0, chris.sale.total$away_team_runs)
# Type
  levels(chris.sale.total$type) <- list("Ball" = "B",
                                        "Strike/Fouls" = "S",
                                        "Ball in play" = "X")
# Pitch Type
  levels(chris.sale.total$pitch_type) <- list("Two-Seam Fastball" = "FT",
                                              "Slider" = "SL",
                                              "Changeup" = "CH",
                                              "Four-Seam Fastball" = "FF",
                                              "Intentional Walk" = "IN",
                                              "Fastball" = "FA",
                                              "Unknow" = "UN",
                                              "Pitch Out" = "PO",
                                              "Splitter" = "FS",
                                              "Curveball" = "CU")
# Outcome
  chris.sale.total$pitch_outcome <- chris.sale.total$pitch_outcome %>% 
                                      fct_collapse("Ball" ="Ball",
                                                   "Strike Looking"  = "Called Strike",
                                                   "Swinging Strike" = c("Missed Bunt", 
                                                                         "Swinging Strike",
                                                                        "Swinging Strike (Blocked)"),
                                                   "Foul" = c("Foul", "Foul (Runner Going)",
                                                              "Foul Bunt", "Foul Tip"),
                                                   "Single" = "Single",
                                                   "Double" = "Double",
                                                   "Triple" = "Triple",
                                                   "Home Run" = "Home Run",
                                                   "Out" = c("Lineout", "Grounded Into DP",
                                                             "Flyout", "Groundout", "Sac Bunt",
                                                             "Forceout", "Pop Out", "Bunt Groundout",
                                                             "Bunt Pop Out", "Double Play", 
                                                             "Sac Fly",
                                                             "Fielders Choice Out", "Triple Play", 
                                                             "Fielders Choice"),
                                                   "Error" = c("Field Error", "Batter Interference",
                                                               "Fan interference"))
# stand
  levels(chris.sale.total$stand) <- list("Left" = "L",
                                         "Right" = "R")
  varible <- read_csv("varibles.csv")

  varible$`Outcome of the pitch` <- ifelse(is.na(varible$X4),
                                           varible$`Outcome of the pitch`,
                                           paste(varible$`Outcome of the pitch`, varible$X4))
  
  varible[, 1:3] %>% kable() %>% kable_styling("striped", full_width = F)
Des factor Outcome of the pitch
Id numeric unique identification number per pitch within a game.
type factor weather the pitch was a ball strike or foul
start_speed numeric speed when the pitch is thrown in mph
px numeric horizontal distance in feet from the centre of home plate as the ball crosses home plate
pz numeric vertical distance in feet from the centre of home plate as the ball crosses home plate
type_confidence numeric the value of weight to the probability the pitch type is correct
pitch_type factor most probable pitch type thrown
nasty numeric how hard a ball is to hit
num numeric used to help join tables
on_1b logical if a player is on first base
on_2b logical if a player is on second base
on_3b logical if a player is on third base
count factor how many balls and strikes there are
gameday_link character link to gameday data (used to join tables)
pitcher numeric id code for the pitcher
batter numeric id code for the batter
o factor number of outs
stand factor if the batter is left or right handed
throws factor if the pitcher left or right handed
atbat_des character what the outcome of the at bat was
event factor simple outcome
home_team_runs numeric score for the home team
away_team_runs numeric score for the away team
inning_side factor if it’s the top or bottom of an innings
inning numeric what innings it is
batter_name character name of the batter
pitcher_name character name of the pitcher
date date date
outcome factor added simpler outcome variable
team factor what team does Sale play for.

Tidy

Data Checking

  # Creating data frame to plot all numerical baribles
    plot.table <- chris.sale.total[, c("nasty", "inning", "type_confidence", "start_speed")] %>% 
      gather(key = "variable", value = "value")
  # plot 1
    ggplot(data = plot.table, aes(x = variable, y = value)) + geom_violin() + 
      geom_boxplot(width=0.1, outlier.colour = 'red') + 
      facet_wrap( ~ variable, scales = "free") + theme_fivethirtyeight()
**Plot 1:** Outliers

Plot 1: Outliers

  # Function to Remove Outliers Using Outlier Detection 
    outlier.qr <- function(data, var_name)
      {
        data$var_name <- eval(substitute(var_name), data) 
        iqr <- IQR(data$`var_name`, na.rm = TRUE)
        qr <- quantile(data$var_name, c(0.25,0.75))
        Q1 <- qr[[1]] - 1.5*iqr
        Q3 <- qr[[2]] + 1.5*iqr
        data <- data %>% filter(var_name < Q3 & var_name > Q1)
        return(data)
    }

  # removing outliers form type_confidence
    chris.sale.total <- outlier.qr(chris.sale.total, type_confidence)
# list of numerical varible
  numerical.varible <- c("start_speed", "px", "pz", "nasty", "inning")
# colour brewer
  pitch_col <- brewer.pal(length(unique(chris.sale.total$pitch_type)),"Set1")
  # pair
    pairs(chris.sale.total[, numerical.varible], lower.panel = NULL, col = pitch_col)
  # add ledgend
    par(xpd = TRUE)
      legend(x = 0.05, y = 0.5, cex = 1, legend = as.character(levels(chris.sale.total$pitch_type)), 
             fill = unique(pitch_col))
**Plot 2:** Corelation Plot 1

Plot 2: Corelation Plot 1

    par(xpd = NA)
# cor plot    
  chris.sale.total[, numerical.varible] %>% cor() %>% corrplot(method = "color", type="upper")
**Plot 3:** Corelation PLot 2

Plot 3: Corelation PLot 2

Data Exploration

  # Formate data
    # Pitches by date and type
      pitch.by.year <- table(year(chris.sale.total$date), chris.sale.total$pitch_type, 
                             dnn = c("year", "pitch_type")) %>% as.data.frame()
    # Pitches by date
      total.pitch.year <- table(year(chris.sale.total$date), dnn = "year") %>% as.data.frame()
    # Join and order
      pitch.by.year <- full_join(pitch.by.year, total.pitch.year, by = "year")
    # Formate and adding percentage as well as year days
      colnames(pitch.by.year) <- c("year", "pitch_type", "type_freq", "year_freq")
      pitch.by.year$perc <- pitch.by.year$type_freq / pitch.by.year$year_freq
      pitch.by.year$year <- pitch.by.year$year %>% as.Date("%Y")
      pitch.by.year$year <- pitch.by.year$year %>% year()
    # Removing zeros
      pitch.by.year <- pitch.by.year %>% filter(type_freq != 0)
  # plot
    ggplot(pitch.by.year, aes(x = year, y = perc)) + 
      geom_line(aes(colour = pitch_type), size = 1) + 
      geom_point(fill = "black", aes(colour = pitch_type)) +
      geom_vline(xintercept = 2016.5, linetype = "dashed", colour = "#666666") +
      geom_text(x = 2016.7, y = 0.6, label = "traded", colour = "#666666", size = 6) +
      theme_fivethirtyeight() + scale_colour_brewer(palette = 'Set1')
**Plot 4:** Frequncey By Year

Plot 4: Frequncey By Year

  # Formate data
    # Pitches by date and type
      pitch.by.date <- table(chris.sale.total$date, chris.sale.total$pitch_type, 
                             dnn = c("date", "pitch_type")) %>% as.data.frame()
    # Pitches by date
      total.pitch.date <- table(chris.sale.total$date, dnn = "date") %>% as.data.frame()
    # Join and order
      pitch.by.date <- full_join(pitch.by.date, total.pitch.date, by = "date")
      pitch.by.date <- pitch.by.date %>% arrange(date)
    # Formate and adding percentage as well as year days
      colnames(pitch.by.date) <- c("date", "pitch_type", "type_freq", "date_freq")
      pitch.by.date$perc <- pitch.by.date$type_freq / pitch.by.date$date_freq
      pitch.by.date$date <- pitch.by.date$date %>% ymd()
      pitch.by.date$date.yd <- pitch.by.date$date %>% yday()
    # Nemoving zeros
      pitch.by.date <- pitch.by.date %>% filter(type_freq != 0)

    
  # time series (ggplot)
    ggplot(pitch.by.date, aes(x = date.yd, y = perc)) + 
      geom_line(aes(colour = pitch_type), size = 1) +
      theme_grey() + facet_grid(year(date)~pitch_type) + 
      scale_colour_brewer(palette = 'Set1') + xlab("Year Days") + ylab("Percentage") + 
      theme_fivethirtyeight()
**Plot 5:** Frequncey By Year Days

Plot 5: Frequncey By Year Days

  # Home plate and ave. strike zone
    # ave strike zone
      strike.zone <- geom_rect(aes(xmin = -0.95, xmax = 0.95, ymin = 1.6, ymax = 3.5), fill = NA, 
                               color = "#ff6116", size = 0.4)
    # home plate
      hp.x <- c(-0.95, -0.95, 0, 0.95, 0.95)
      hp.y <- c(0, -0.475, -0.95, -0.475, 0)
      hp <- data.frame(hp.x, hp.y)
    # Join
      home.plate  <- geom_polygon(data = hp, aes(x = hp.x, y = hp.y), fill = NA, 
                                  color = "#ff6116", size = 0.5)
  # Hex plot
    # All Pitches
      ggplot(data = chris.sale.total, aes(x = px, y = pz)) + geom_hex() + coord_fixed() + 
        strike.zone + home.plate + scale_fill_viridis() + theme_fivethirtyeight()
**Plot 6:** Location of All Pitches

Plot 6: Location of All Pitches

    # facet by pitch type
      # Finding tally
        pitch.type.tally <- chris.sale.total %>% group_by(pitch_type) %>% tally()
      # plot
         ggplot(data = chris.sale.total, aes(x = px, y = pz)) + geom_hex() + coord_fixed() + 
           strike.zone + home.plate + scale_fill_viridis() + facet_grid(~pitch_type) + 
           theme_fivethirtyeight() + 
           geom_label(data = pitch.type.tally, aes(label=paste('n=', n)), 
                      x = -4.25, y = 5, vjust = 0, hjust = 0)
**Plot 7:** Location by Pitch Type

Plot 7: Location by Pitch Type

  # stand
    # stand and pitch type
      # Finding tally
         pitch.stand.type.tally <- chris.sale.total %>% group_by(pitch_type, stand) %>% tally()
      # plot location
        ggplot(data = chris.sale.total, aes(x = px, y = pz)) + geom_hex() + coord_fixed() + 
          strike.zone + home.plate + scale_fill_viridis() + facet_grid(stand~pitch_type) + 
          theme_fivethirtyeight() + 
          geom_label(data = pitch.stand.type.tally, aes(label=paste('n=', n)),
                     x = -4.25, y = 5, vjust = 0, hjust = 0)
**Plot 8:** Location by Pitch Type and Stand

Plot 8: Location by Pitch Type and Stand

  # formate
    stand.freq <- chris.sale.total %>% group_by(stand) %>% tally()
    type.stand.freq <- stand.freq %>% full_join(pitch.stand.type.tally, by = "stand")
    colnames(type.stand.freq) <- c("stand", "stand.total", "pitch_type", "stand_type_total")
    type.stand.freq$perc <- type.stand.freq$stand_type_total / type.stand.freq$stand.total

  # pitch type freqency plot
    ggplot(data = type.stand.freq, aes(x = pitch_type, y = perc)) + coord_flip() + 
      geom_bar(aes(fill = pitch_type), stat = "identity") + facet_grid(.~stand)  +
      theme_fivethirtyeight() + scale_fill_brewer(palette = 'Set1') + 
      scale_y_continuous(labels = scales::percent)
**Plot 9:** Freqcency of Pitches by Stand

Plot 9: Freqcency of Pitches by Stand

  # team
    # stand and pitch type
      # Finding tally
         pitch.team.type.tally <- chris.sale.total %>% group_by(pitch_type, team) %>% tally()
      # plot
        ggplot(data = chris.sale.total, aes(x = px, y = pz)) + geom_hex() + coord_fixed() + 
          strike.zone + home.plate + scale_fill_viridis() + facet_grid(team~pitch_type) + 
          theme_fivethirtyeight() + 
          geom_label(data = pitch.team.type.tally, aes(label=paste('n=', n)), 
                      x = -4.25, y = 5, vjust = 0, hjust = 0)
**Plot 10:** Location by Pitch Type and Team

Plot 10: Location by Pitch Type and Team

  # formate
    holder <- chris.sale.total %>% filter(year(date) == 2016 | year(date) == 2017)
    pitch.outcome.tally <- holder %>% group_by(pitch_outcome, year(date)) %>% tally()

  # plot
    ggplot(pitch.outcome.tally, aes(x = pitch_outcome, y = n)) + 
      geom_bar(aes(fill = pitch_outcome), stat = "identity") + coord_flip() + 
      facet_grid(.~ `year(date)`) + scale_fill_brewer(palette = 'Set3')  + theme_fivethirtyeight() + 
      geom_label(data = pitch.outcome.tally, aes(label=n))
**Plot 11:** Outcome by Year

Plot 11: Outcome by Year