#Run Set up to install packages

#Pick pitchers who throw 75 or more pitches per game

#gamedat <- get_payload(start =  "2018-03-29", end = "2018-09-30") #ran to get 2018 MLB season.
# Pitch-level table (pitches joined to at-bats) for the season.
all_data <- fread(input = "PitchJOINAtBat.csv")
# Pitchers averaging 75 or more pitches; exported from a query to .csv.
Pitchers75 <- fread(input = "qryAveragePitches75OrMore.csv")

#Choose 1 pitcher by a Random Generator ##generate a random sample from a discrete uniform distribution

# Draw one pitcher uniformly at random from the 75+ pitch list.
# ceiling(runif(1, -1, 165)) could return 0 (which selects nothing) and
# assumed exactly 165 names; sampling over the real number of unique names
# always yields a valid index.
pitcherPool <- unique(Pitchers75$pitcher_name)
stopifnot(length(pitcherPool) > 0)
rs <- sample.int(length(pitcherPool), 1)
#rs <- 96  #Kyle Gibson is in the 96th position in the vector.
reqPitchers <- pitcherPool[rs]

# Keep the columns used by the charts below, drop rows whose pitch_type is
# "NA" (or missing), and keep only the chosen pitcher's pitches.
subPitches <- all_data %>%
  select(V1, start_speed, des, date, pitcher_name, batter_name, count, type,
         pitch_type, event, b_height, p_throws, atbat_des, px, pz, stand,
         code, on_1b, on_2b, on_3b, nasty, o) %>%
  filter(pitch_type != "NA") %>%
  filter(pitcher_name %in% reqPitchers)

#Pitch Types by state of bases occupied chart
# Share of each pitch type within every on-base state for the chosen pitcher.
# `state` encodes first_second_third as 1 (occupied) or 0 (empty), e.g. "1_0_0".
subPitches %>%
  filter(pitcher_name %in% reqPitchers) %>%
  mutate(state = paste0(as.integer(!is.na(on_1b)), '_',
                        as.integer(!is.na(on_2b)), '_',
                        as.integer(!is.na(on_3b)))) %>%
  mutate(pitch_type = substr(pitch_type, 1, 2)) %>%
  group_by(state, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(state) %>%
  mutate(perc_events = freq / sum(freq)) %>%
  ggplot(aes(x = state, y = perc_events, fill = pitch_type)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  # p_throws is a per-pitch column: take a single value so the title is one
  # string rather than one string per row. na.rm guards against missing speeds.
  labs(title = paste("On-Base State vs Pitch Types-", reqPitchers[1],
                     "Ave. Speed",
                     round(mean(subPitches$start_speed, na.rm = TRUE), digits = 0),
                     "Throws", subPitches$p_throws[1]))

# splitting by the number of balls and Pitch Type
# Share of each pitch type within every ball-strike count, faceted by the
# first two characters of the count string.
subPitches %>%
  mutate(pitch_type = substr(pitch_type, 1, 2)) %>%   #can use substring (1,1) to group by first letter
  filter(pitcher_name %in% reqPitchers) %>%
  group_by(count, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(count) %>%
  mutate(perc_pitches = freq / sum(freq),
         ball_count = substr(count, 1, 2)) %>%
  ggplot(aes(x = count, y = perc_pitches, fill = pitch_type)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  facet_wrap(~ball_count, scales = 'free') +
  # Use a single p_throws value (the column has one entry per pitch) and
  # ignore missing speeds so the title is a single, non-NA string.
  labs(title = paste("Count Situations vs Pitch Type-", reqPitchers[1],
                     "Ave. Speed",
                     round(mean(subPitches$start_speed, na.rm = TRUE), digits = 0),
                     "Throws", subPitches$p_throws[1]))

#Look at Pitch Chart visuals For Home Run Pitch Type
# Pitches from the chosen pitcher whose at-bat event is a home run.
# NOTE(review): type 'X' presumably marks balls put in play - confirm.
batterHITHR <- subPitches %>%
  filter(type == 'X') %>%
  filter(event == 'Home Run')

# Location (px, pz) of those pitches over the strike-zone outline.
# The handedness label is taken once from subPitches so the title is a single
# string and is still defined when there are no home-run rows.
ggplot() +
  geom_point(data = batterHITHR, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  coord_equal() + geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste("Home Run Pitch Type and Location", reqPitchers[1],
                     "Throws", subPitches$p_throws[1]))

# Lefty or righty batting stance
ggplot() +
  geom_point(data = batterHITHR, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) + coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste("Home Runs against", reqPitchers[1],
                     "Throws", subPitches$p_throws[1]))

# Hypothesis: do pitchers whose start_speed is in the top quartile rely on
# their fastball in tough game situations?
# Null hypothesis (Ho): a fastball pitcher does not rely on pitch-type sequence
#   in tough game situations; the other 90% do not rely on the fastball in
#   tough situations.
# Alternative hypothesis (Ha): all pitchers follow a similar pitch sequence no
#   matter what their start speed is.
# Fastball pitchers in the top quartile pitch differently; it also depends on
# the number of pitch types they can master.
# NOTE(review): "the other 90%" does not match a top-quartile split, and the
# "no difference" statement is conventionally the null - confirm the wording.

# Names of all pitchers in the 75-or-more-pitches list.
nmesINPITCH_75 <- unique(Pitchers75$pitcher_name)

# Every typed pitch thrown by those pitchers, trimmed to the analysis columns.
PitcherAnalysis <- all_data %>%
  select(
    V1, start_speed, date, pitcher_name, batter_name, count, type, pitch_type,
    event, b_height, p_throws, atbat_des, px, pz, stand, code,
    on_1b, on_2b, on_3b, nasty
  ) %>%
  filter(pitch_type != "NA", pitcher_name %in% nmesINPITCH_75)

# One row per pitcher with the mean release speed, then its distribution.
allPitchers75ormore <- PitcherAnalysis %>%
  group_by(pitcher_name) %>%
  summarise(mean_start_speed = mean(start_speed, na.rm = TRUE))
summary(allPitchers75ormore)
##  pitcher_name       mean_start_speed
##  Length:140         Min.   :81.73   
##  Class :character   1st Qu.:86.04   
##  Mode  :character   Median :88.08   
##                     Mean   :87.92   
##                     3rd Qu.:90.00   
##                     Max.   :93.74

# Top Quartile analysis

#generate a random sample from a discrete uniform distribution

# Per-pitcher mean speed computed over pitches faster than 90 only.
# NOTE(review): the cut is applied per pitch, not per pitcher, so any pitcher
# with at least one pitch above 90 is kept (the recorded summary below shows
# 136 pitchers). Presumably a per-pitcher quartile was intended - confirm.
Pitchers3rdQuartile <- PitcherAnalysis %>%
  filter(pitcher_name %in% nmesINPITCH_75, start_speed > 90) %>%
  group_by(pitcher_name) %>%
  summarise(mean_start_speed = mean(start_speed, na.rm = TRUE))
summary(Pitchers3rdQuartile)
##  pitcher_name       mean_start_speed
##  Length:136         Min.   :90.32   
##  Class :character   1st Qu.:91.15   
##  Mode  :character   Median :92.23   
##                     Mean   :92.57   
##                     3rd Qu.:93.66   
##                     Max.   :96.99
# Choose one pitcher name at random from Pitchers3rdQuartile and collect that
# pitcher's pitches in subPitchesTop for the charts below.
# ceiling(runif(1, -1, 136)) could return 0 (no selection) and hard-coded the
# table size; sample over the actual number of names instead.
pitcherPoolTop <- unique(Pitchers3rdQuartile$pitcher_name)
stopifnot(length(pitcherPoolTop) > 0)
rs <- sample.int(length(pitcherPoolTop), 1)
nmesINPITCH_3rdQuartile <- pitcherPoolTop[rs]

# Analysis columns only; rows with pitch_type "NA" (or missing) are dropped.
subPitchesTop <- all_data %>%
  select(V1, start_speed, des, date, pitcher_name, batter_name, count, type,
         pitch_type, event, b_height, p_throws, atbat_des, px, pz, stand,
         code, on_1b, on_2b, on_3b, nasty, o) %>%
  filter(pitch_type != "NA") %>%
  filter(pitcher_name %in% nmesINPITCH_3rdQuartile)

#Pitch Types by state of bases occupied chart
# Pitch-type mix per on-base state for the selected pitcher.
# `state` is first_second_third with 1 = runner present, 0 = base empty.
# NOTE(review): the speed in the title is the mean over every pitcher in
# Pitchers3rdQuartile, not the named pitcher's own mean - confirm intent.
subPitchesTop %>%
  filter(pitcher_name %in% nmesINPITCH_3rdQuartile) %>%
  mutate(
    state = paste0(
      as.integer(!is.na(on_1b)), "_",
      as.integer(!is.na(on_2b)), "_",
      as.integer(!is.na(on_3b))
    ),
    pitch_type = substr(pitch_type, 1, 2)
  ) %>%
  group_by(state, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(state) %>%
  mutate(perc_events = freq / sum(freq)) %>%
  ggplot(aes(x = state, y = perc_events, fill = pitch_type)) +
  geom_col(position = "dodge") +
  labs(
    title = paste(
      "Top Quart. Pitcher Speed vs Pitch Types-",
      nmesINPITCH_3rdQuartile[1],
      "Ave. Fastball",
      round(mean(Pitchers3rdQuartile$mean_start_speed), digits = 0)
    )
  )

# splitting by the number of balls and Pitch Type
# Pitch-type mix per ball-strike count for the selected pitcher; panels are
# keyed by the first two characters of the count string.
subPitchesTop %>%
  filter(pitcher_name %in% nmesINPITCH_3rdQuartile) %>%
  mutate(pitch_type = substr(pitch_type, 1, 2)) %>%
  group_by(count, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(count) %>%
  mutate(
    perc_pitches = freq / sum(freq),
    ball_count = substr(count, 1, 2)
  ) %>%
  ggplot(aes(x = count, y = perc_pitches, fill = pitch_type)) +
  geom_col(position = "dodge") +
  facet_wrap(~ball_count, scales = "free") +
  labs(
    title = paste(
      "Top Quart. Pitcher Speed vs Pitch Types-",
      nmesINPITCH_3rdQuartile[1],
      "Ave. Fastball",
      round(mean(Pitchers3rdQuartile$mean_start_speed), digits = 0)
    )
  )

#Look at Pitch Chart Visuals

#Look at Pitch Chart visuals
# Home-run pitches thrown by the selected pitcher.
# NOTE(review): type 'X' presumably marks balls put in play - confirm.
batterHITHR <- subPitchesTop %>%
  filter(type == 'X') %>%
  filter(event == 'Home Run')

# Pitch location over the strike-zone outline. Handedness is read once from
# subPitchesTop so the title is a single string even with no home-run rows.
ggplot() +
  geom_point(data = batterHITHR, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  coord_equal() + geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste("Home Runs against", nmesINPITCH_3rdQuartile[1],
                     "Throws", subPitchesTop$p_throws[1]))

# Lefty or righty batting stance
ggplot() +
  geom_point(data = batterHITHR, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) + coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste("Home Runs for L and R batters  against",
                     nmesINPITCH_3rdQuartile[1],
                     "Throws", subPitchesTop$p_throws[1]))

       #,"Run Expectancy Situation", "Count Situation"))
#All Pitches Left and Righty
# Every pitch by the selected pitcher, split by batter stance.
# The handedness label comes from the plotted data (subPitchesTop) rather
# than from the home-run subset, and is reduced to a single value.
ggplot() +
  geom_point(data = subPitchesTop, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) + coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste("All Pitches for L and R batters  against",
                     nmesINPITCH_3rdQuartile[1],
                     "Throws", subPitchesTop$p_throws[1]))

       #,"Run Expectancy Situation", "Count Situation"))

# Middle Quartile Analysis

#Set up Middle Quartile information
# Per-pitcher mean speed computed over pitches in the (86, 90] range only.
# NOTE(review): the range is applied per pitch, so a pitcher is kept whenever
# any of their pitches falls in it (140 pitchers in the recorded summary).
PitchersMiddleQuartile <- PitcherAnalysis %>%
  filter(
    pitcher_name %in% nmesINPITCH_75,
    start_speed > 86,
    start_speed <= 90
  ) %>%
  group_by(pitcher_name) %>%
  summarise(mean_start_speed = mean(start_speed, na.rm = TRUE))
summary(PitchersMiddleQuartile)
##  pitcher_name       mean_start_speed
##  Length:140         Min.   :86.82   
##  Class :character   1st Qu.:87.88   
##  Mode  :character   Median :88.20   
##                     Mean   :88.21   
##                     3rd Qu.:88.58   
##                     Max.   :89.56
# Choose one pitcher name at random from PitchersMiddleQuartile and collect
# that pitcher's pitches in subPitchesMiddle for the charts below.
# ceiling(runif(1, -1, 140)) could return 0 (no selection) and hard-coded the
# table size; sample over the actual number of names instead.
pitcherPoolMiddle <- unique(PitchersMiddleQuartile$pitcher_name)
stopifnot(length(pitcherPoolMiddle) > 0)
rs <- sample.int(length(pitcherPoolMiddle), 1)
nmesINPITCH_MiddleQuartile <- pitcherPoolMiddle[rs]

# Analysis columns only; rows with pitch_type "NA" (or missing) are dropped.
subPitchesMiddle <- all_data %>%
  select(V1, start_speed, des, date, pitcher_name, batter_name, count, type,
         pitch_type, event, b_height, p_throws, atbat_des, px, pz, stand,
         code, on_1b, on_2b, on_3b, nasty, o) %>%
  filter(pitch_type != "NA") %>%
  filter(pitcher_name %in% nmesINPITCH_MiddleQuartile)
#Pitch Types by state of bases occupied chart
# Share of each pitch type within every on-base state for the selected
# pitcher. `state` is first_second_third with 1 = occupied, 0 = empty.
subPitchesMiddle %>%
  filter(pitcher_name %in% nmesINPITCH_MiddleQuartile) %>%
  mutate(state = paste0(as.integer(!is.na(on_1b)), '_',
                        as.integer(!is.na(on_2b)), '_',
                        as.integer(!is.na(on_3b)))) %>%
  mutate(pitch_type = substr(pitch_type, 1, 2)) %>%
  group_by(state, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(state) %>%
  mutate(perc_events = freq / sum(freq)) %>%
  ggplot(aes(x = state, y = perc_events, fill = pitch_type)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  # na.rm = TRUE keeps the title from reading "NA" when a speed is missing,
  # matching how the per-pitcher means are computed elsewhere in this script.
  labs(title = paste("Mid Quart. On-Base vs Pitch Types against",
                     nmesINPITCH_MiddleQuartile[1], "Ave. Fastball",
                     round(mean(subPitchesMiddle$start_speed, na.rm = TRUE), digits = 0)))

# splitting by the number of balls and Pitch Type
# Share of each pitch type within every ball-strike count for the selected
# pitcher, faceted by the first two characters of the count string.
subPitchesMiddle %>%
  mutate(pitch_type = substr(pitch_type, 1, 2)) %>%
  filter(pitcher_name %in% nmesINPITCH_MiddleQuartile) %>%
  group_by(count, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(count) %>%
  mutate(perc_pitches = freq / sum(freq),
         ball_count = substr(count, 1, 2)) %>%
  ggplot(aes(x = count, y = perc_pitches, fill = pitch_type)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  facet_wrap(~ball_count, scales = 'free') +
  # na.rm = TRUE keeps the title from reading "NA" when a speed is missing.
  labs(title = paste("Mid Quart. Count vs Pitch Types against",
                     nmesINPITCH_MiddleQuartile[1], "Ave. Fastball",
                     round(mean(subPitchesMiddle$start_speed, na.rm = TRUE), digits = 0)))

#Look at Pitch Chart visuals
# Home-run pitches thrown by the selected middle-range pitcher.
# NOTE(review): type 'X' presumably marks balls put in play - confirm.
batterHITHR <- subPitchesMiddle %>%
  filter(type == 'X') %>%
  filter(event == 'Home Run')

# Pitch location over the strike-zone outline. Handedness is read once from
# subPitchesMiddle so the title is a single string even with no home runs.
ggplot() +
  geom_point(data = batterHITHR, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  coord_equal() + geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste("Home Runs  against", nmesINPITCH_MiddleQuartile[1],
                     "Throws", subPitchesMiddle$p_throws[1]))

# Lefty or righty batting stance
ggplot() +
  geom_point(data = batterHITHR, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) + coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste("Home Runs for L and R batters against",
                     nmesINPITCH_MiddleQuartile[1],
                     "Throws", subPitchesMiddle$p_throws[1]))

#All Pitches Left and Righty
# Every pitch by the selected pitcher, split by batter stance.
# The handedness label comes from the plotted data (subPitchesMiddle) rather
# than from the home-run subset, and is reduced to a single value.
ggplot() +
  geom_point(data = subPitchesMiddle, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) + coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste("All Pitches for L and R batters  against",
                     nmesINPITCH_MiddleQuartile[1],
                     "Throws", subPitchesMiddle$p_throws[1]))

#subPitchesMiddle #subPitches #subPitchesTop

# Swinging strikes ("whiffs") among the randomly chosen pitcher's pitches.
WhiffPitchTypes <- subPitches %>%
  filter(des == "Swinging Strike")

# Share of each pitch type among whiffs, per on-base state
# (first_second_third, 1 = occupied, 0 = empty).
WhiffPitchTypes %>%
  filter(pitcher_name %in% reqPitchers) %>%   #reqPitchers
  mutate(state = paste0(as.integer(!is.na(on_1b)), '_',
                        as.integer(!is.na(on_2b)), '_',
                        as.integer(!is.na(on_3b)))) %>%
  mutate(pitch_type = substr(pitch_type, 1, 2)) %>%
  group_by(state, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(state) %>%
  mutate(perc_events = freq / sum(freq)) %>%
  ggplot(aes(x = state, y = perc_events, fill = pitch_type)) +
  geom_bar(stat = 'identity', position = 'dodge') +
  # Handedness must come from subPitches (this pitcher); batterHITHR was last
  # assigned from a different pitcher's data earlier in the script.
  labs(title = paste("WHIFF's vs Pitch Types-", reqPitchers[1], "Ave. Speed",
                     round(mean(subPitches$start_speed, na.rm = TRUE), digits = 0),
                     "Throws", subPitches$p_throws[1]))

            # splitting by the number of balls and Pitch Type
      # Share of each pitch type among whiffs, per ball-strike count.
      WhiffPitchTypes %>%
        mutate(pitch_type = substr(pitch_type, 1, 2)) %>%
        filter(pitcher_name %in% reqPitchers) %>%
        group_by(count, pitch_type) %>%
        summarise(freq = n()) %>%
        group_by(count) %>%
        mutate(perc_pitches = freq / sum(freq),
               ball_count = substr(count, 1, 2)) %>%
        ggplot(aes(x = count, y = perc_pitches, fill = pitch_type)) +
        geom_bar(stat = 'identity', position = 'dodge') +
        facet_wrap(~ball_count, scales = 'free') +
        # Handedness is read from subPitches (the pitcher being charted), as a
        # single value; batterHITHR holds another pitcher's rows at this point.
        labs(title = paste("Count Situations - WHIFF vs Pitch Types", reqPitchers[1],
                           "Ave. Speed",
                           round(mean(subPitches$start_speed, na.rm = TRUE), digits = 0),
                           "Throws", subPitches$p_throws[1]))

      # Location of every whiff over the strike-zone outline. The handedness
      # label is taken from subPitches (the charted pitcher) as one value;
      # batterHITHR belongs to a different pitcher by this point.
      ggplot() +
        geom_point(data = WhiffPitchTypes, aes(x = px, y = pz, shape = type, col = pitch_type)) +
        coord_equal() + geom_path(aes(x, y), data = mlbgameday::kzone) +
        labs(title = paste("WHIFF vs Pitch Types", reqPitchers[1], "Ave. Speed",
                           round(mean(subPitches$start_speed, na.rm = TRUE), digits = 0),
                           "Throws", subPitches$p_throws[1]))

      # Lefty or righty batting stance
      ggplot() +
        geom_point(data = WhiffPitchTypes, aes(x = px, y = pz, shape = type, col = pitch_type)) +
        facet_grid(. ~ stand) + coord_equal() +
        geom_path(aes(x, y), data = mlbgameday::kzone) +
        labs(title = paste("WHIFF vs Pitch Types", reqPitchers[1], "Ave. Speed",
                           round(mean(subPitches$start_speed, na.rm = TRUE), digits = 0),
                           "Throws", subPitches$p_throws[1]))

#which were strike outs
      # Whiffs whose at-bat event is recorded as a strikeout.
      strikeoutWHIFF_PitchTypes <- WhiffPitchTypes %>%
        filter(event == "Strikeout")

      # Share of each pitch type among those whiffs, per on-base state
      # (first_second_third, 1 = occupied, 0 = empty).
      strikeoutWHIFF_PitchTypes %>%
        filter(pitcher_name %in% reqPitchers) %>%   #reqPitchers
        mutate(state = paste0(as.integer(!is.na(on_1b)), '_',
                              as.integer(!is.na(on_2b)), '_',
                              as.integer(!is.na(on_3b)))) %>%
        mutate(pitch_type = substr(pitch_type, 1, 2)) %>%
        group_by(state, pitch_type) %>%
        summarise(freq = n()) %>%
        group_by(state) %>%
        mutate(perc_events = freq / sum(freq)) %>%
        ggplot(aes(x = state, y = perc_events, fill = pitch_type)) +
        geom_bar(stat = 'identity', position = 'dodge') +
        # p_throws is per pitch; use one value from subPitches so the title is
        # a single string and is defined even when no strikeout whiffs exist.
        labs(title = paste("Strikeout WHIFFs vs Pitch Types", reqPitchers[1],
                           "Ave. Speed",
                           round(mean(subPitches$start_speed, na.rm = TRUE), digits = 0),
                           "Throws", subPitches$p_throws[1]))

     # splitting by the number of balls and Pitch Type
      # Share of each pitch type among strikeout whiffs, per ball-strike count.
      strikeoutWHIFF_PitchTypes %>%
        mutate(pitch_type = substr(pitch_type, 1, 2)) %>%
        filter(pitcher_name %in% reqPitchers) %>%
        group_by(count, pitch_type) %>%
        summarise(freq = n()) %>%
        group_by(count) %>%
        mutate(perc_pitches = freq / sum(freq),
               ball_count = substr(count, 1, 2)) %>%
        ggplot(aes(x = count, y = perc_pitches, fill = pitch_type)) +
        geom_bar(stat = 'identity', position = 'dodge') +
        facet_wrap(~ball_count, scales = 'free') +
        # One p_throws value (the column has one entry per pitch) keeps the
        # title a single string; na.rm avoids an "NA" speed.
        labs(title = paste("Strikeout WHIFFs vs Pitch Types", reqPitchers[1],
                           "Ave. Speed",
                           round(mean(subPitches$start_speed, na.rm = TRUE), digits = 0),
                           "Throws", subPitches$p_throws[1]))

      # Location of strikeout whiffs over the strike-zone outline. The title
      # uses a single p_throws value instead of the whole per-pitch column.
      ggplot() +
        geom_point(data = strikeoutWHIFF_PitchTypes, aes(x = px, y = pz, shape = type, col = pitch_type)) +
        coord_equal() + geom_path(aes(x, y), data = mlbgameday::kzone) +
        labs(title = paste("Strikeout WHIFFs vs Pitch Types", reqPitchers[1],
                           "Ave. Speed",
                           round(mean(subPitches$start_speed, na.rm = TRUE), digits = 0),
                           "Throws", subPitches$p_throws[1]))

      # Lefty or righty batting stance
      ggplot() +
        geom_point(data = strikeoutWHIFF_PitchTypes, aes(x = px, y = pz, shape = type, col = pitch_type)) +
        facet_grid(. ~ stand) + coord_equal() +
        geom_path(aes(x, y), data = mlbgameday::kzone) +
        labs(title = paste("Strikeout WHIFFs vs Pitch Types", reqPitchers[1],
                           "Ave. Speed",
                           round(mean(subPitches$start_speed, na.rm = TRUE), digits = 0),
                           "Throws", subPitches$p_throws[1]))

      #strikeoutWHIFF_PitchTypes with Runner on 3rd.  What pitch_type is thrown.
      # Strikeout whiffs thrown with a runner on third (on_3b not missing).
      PitchType_RunnerThirdWhiff <- strikeoutWHIFF_PitchTypes %>%
        filter(!is.na(on_3b))

      # Pitch location over the strike-zone outline. Handedness is one value
      # from subPitches, so the title stays a single string even when this
      # subset is empty.
      ggplot() +
        geom_point(data = PitchType_RunnerThirdWhiff, aes(x = px, y = pz, shape = type, col = pitch_type)) +
        coord_equal() + geom_path(aes(x, y), data = mlbgameday::kzone) +
        labs(title = paste("WHIFF pitch type - runner on third", reqPitchers[1],
                           "Throws", subPitches$p_throws[1]))

      # Same chart split by batter stance.
      ggplot() +
        geom_point(data = PitchType_RunnerThirdWhiff, aes(x = px, y = pz, shape = type, col = pitch_type)) +
        facet_grid(. ~ stand) + coord_equal() +
        geom_path(aes(x, y), data = mlbgameday::kzone) +
        labs(title = paste("WHIFF pitch type - runner on third", reqPitchers[1],
                           "Throws", subPitches$p_throws[1]))

#Fastball and Offspeed for different Count States #Examine Data. These are the first analysis completed to drill into the data

# All pitchers from the 75-or-more-pitches list.
# unique(all_data$pitcher_name)[1:165] took the first 165 names that happen to
# appear in the pitch table (and yields NA entries if there are fewer), which
# is not the 75+ list the filter below is meant to apply.
reqPitchers <- unique(Pitchers75$pitcher_name)

# Every typed pitch (fastball and off-speed) thrown by those pitchers.
# NOTE(review): the original author recorded 273724 observations for the
# previous selection - re-check the row count after this change.
FastBallAnalyis <- all_data %>%
  filter(pitch_type != "NA") %>%
  filter(pitcher_name %in% reqPitchers)

# Split the pitches by pitch-type code.
# NOTE(review): "SI" is grouped with off-speed here - confirm that is intended.
FastballTotal <- filter(
  FastBallAnalyis,
  pitch_type %in% c("FC", "FF", "FS", "FT")
)

OffSpeedTotal <- filter(
  FastBallAnalyis,
  pitch_type %in% c("CH", "CU", "EP", "KC", "KN", "SC", "SI", "SL")
)

# Fastballs by count group: pitcher-favourable vs batter-favourable counts.
FastBall_PitcherFavState <- filter(
  FastballTotal,
  count %in% c("2-2", "1-2", "0-2", "0-1", "1-1")
)

FastBall_BatterFavState <- filter(
  FastballTotal,
  count %in% c("2-1", "3-1", "3-2", "1-0", "2-0", "3-0")
)

# Off-speed pitches by the same two count groups.
OffSpeed_PitcherFavState <- filter(
  OffSpeedTotal,
  count %in% c("2-2", "1-2", "0-2", "0-1", "1-1")
)

OffSpeed_BatterFavState <- filter(
  OffSpeedTotal,
  count %in% c("2-1", "3-1", "3-2", "1-0", "2-0", "3-0")
)

# First pitch of the at-bat (count 0-0), by pitch group.
FirstPitchFastball <- filter(FastballTotal, count == "0-0")

FirstPitchOffspeed <- filter(OffSpeedTotal, count == "0-0")

#plot relevant information

# Label every pitch with a coarse pitch group (codes starting with "F" count
# as fastballs, everything else as off-speed) and a count situation.
dataforplot <- FastBallAnalyis %>%
  select(count, pitch_type) %>%
  mutate(
    pitch_type2 = ifelse(substr(pitch_type, 1, 1) == "F", "FastBAll", "offSpeed")
  ) %>%
  mutate(
    situation = ifelse(
      count %in% c("2-2", "1-2", "0-2", "0-1", "1-1"),
      "favPitcher",
      ifelse(count == "0-0", "firstPitch", "favBatter")
    )
  )
# percentage of fast and offspeed balls in different situations.
# For each pitch group, the number and share of pitches thrown in
# pitcher-favourable vs batter-favourable counts (first pitches excluded).
datatemp <- dataforplot %>%
  filter(situation != "firstPitch") %>%
  group_by(pitch_type2, situation) %>%
  summarise(count = n()) %>%
  group_by(pitch_type2) %>%
  mutate(
    total = sum(count),
    perc = count / total
  )

# Bar chart of the situation shares, one panel per pitch group, with the
# y axis formatted as percentages.
ggplot(datatemp, aes(x = situation, y = perc, fill = situation)) +
  geom_col() +
  facet_wrap(~pitch_type2) +
  scale_y_continuous(labels = percent) +
  labs(title = "Fastball vs Off-speed when count is favorable and unfavorable")