#Run Set up to install packages
#Pick pitchers who throw 75 or more pitches per game
#gamedat <- get_payload(start = "2018-03-29", end = "2018-09-30") #ran to get 2018 MLB season.
# Load the pitch data (pitches joined to at-bats, per the file name) and the
# list of pitchers averaging 75 or more pitches per game (query exported to CSV).
all_data <- fread("PitchJOINAtBat.csv")
Pitchers75 <- fread("qryAveragePitches75OrMore.csv")

# Choose one pitcher uniformly at random from the 75+ pitch list.
# The index is drawn from the real number of names. The previous
# ceiling(runif(1, -1, 165)) could evaluate to 0, which selects no pitcher at
# all, and it hard-coded the size of the list.
pitcher_pool <- unique(Pitchers75$pitcher_name)
stopifnot(length(pitcher_pool) > 0)
rs <- sample.int(length(pitcher_pool), 1)
#rs <- 96 #Kyle Gibson is in the 96th position in the vector.
reqPitchers <- pitcher_pool[rs]

# Keep only the columns used by the charts, drop rows without a pitch type and
# restrict the data to the randomly chosen pitcher.
subPitches <- all_data %>%
  select(
    V1, start_speed, des, date, pitcher_name, batter_name, count, type,
    pitch_type, event, b_height, p_throws, atbat_des, px, pz, stand, code,
    on_1b, on_2b, on_3b, nasty, o
  ) %>%
  filter(pitch_type != "NA") %>%
  filter(pitcher_name %in% reqPitchers)
# Pitch types by state of bases occupied chart.
# `state` is built from on_1b / on_2b / on_3b: each position is 1 when the
# column is not NA and 0 otherwise (e.g. "1_0_1"). Bars show each pitch type's
# share of the pitches thrown within a state.
subPitches %>%
  filter(pitcher_name %in% reqPitchers) %>%
  mutate(
    state = paste0(
      as.integer(!is.na(on_1b)), "_",
      as.integer(!is.na(on_2b)), "_",
      as.integer(!is.na(on_3b))
    ),
    pitch_type = substr(pitch_type, 1, 2)
  ) %>%
  group_by(state, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(state) %>%
  mutate(perc_events = freq / sum(freq)) %>%
  ggplot(aes(x = state, y = perc_events, fill = pitch_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = paste(
    "On-Base State vs Pitch Types-", reqPitchers[1],
    "Ave. Speed", round(mean(subPitches$start_speed), digits = 0),
    # paste() is vectorised: passing the whole p_throws column produced one
    # title string per pitch row, so collapse it to a single value first.
    "Throws", paste(unique(subPitches$p_throws), collapse = "/")
  ))
# Pitch-type share by count for the selected pitcher, one facet per
# `ball_count` (the first two characters of the count string).
subPitches %>%
  mutate(pitch_type = substr(pitch_type, 1, 2)) %>% # can use substr(1, 1) to group by first letter
  filter(pitcher_name %in% reqPitchers) %>%
  group_by(count, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(count) %>%
  mutate(
    perc_pitches = freq / sum(freq),
    ball_count = substr(count, 1, 2)
  ) %>%
  ggplot(aes(x = count, y = perc_pitches, fill = pitch_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ball_count, scales = "free") +
  labs(title = paste(
    "Count Situations vs Pitch Type-", reqPitchers[1],
    "Ave. Speed", round(mean(subPitches$start_speed), digits = 0),
    # paste() is vectorised: passing the whole p_throws column produced one
    # title string per pitch row, so collapse it to a single value first.
    "Throws", paste(unique(subPitches$p_throws), collapse = "/")
  ))
# Pitch-location charts for the home runs hit off the selected pitcher.
# Rows are kept when type is "X" and the event is "Home Run".
batterHITHR <- subPitches %>%
  filter(type == "X", event == "Home Run")

# Home-run pitch locations over the strike-zone outline (mlbgameday::kzone).
# Handedness is taken from all of the pitcher's rows and collapsed to one
# string: using batterHITHR$p_throws directly gave one title per home run and
# an empty value when no home runs were allowed.
ggplot() +
  geom_point(data = batterHITHR, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "Home Run Pitch Type and Location", reqPitchers[1],
    "Throws", paste(unique(subPitches$p_throws), collapse = "/")
  ))

# Same chart split by batter stance (`stand`).
ggplot() +
  geom_point(data = batterHITHR, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "Home Runs against", reqPitchers[1],
    "Throws", paste(unique(subPitches$p_throws), collapse = "/")
  ))
#Hypothesis -> Do pitchers who throw in the top quartile of start_speed rely on their fastball in tough game situations? # Null Hypothesis - Ho - A fastball pitcher does not rely on Pitch Type Sequence in tough game situations. The other 90% do not rely on the fastball in tough situations. # Alternative Hypothesis - Ha - All pitchers follow similar pitch sequence no matter what their start speed is. #Fastball pitchers in the top quartile pitch differently, it also depends on number of pitch types they can master.
# Names of every pitcher in the 75-or-more-pitches-per-game list.
nmesINPITCH_75 <- unique(Pitchers75$pitcher_name)

# All pitches (with a pitch type) thrown by those pitchers.
PitcherAnalysis <- all_data %>%
  select(
    V1, start_speed, date, pitcher_name, batter_name, count, type, pitch_type,
    event, b_height, p_throws, atbat_des, px, pz, stand, code,
    on_1b, on_2b, on_3b, nasty
  ) %>%
  filter(pitch_type != "NA", pitcher_name %in% nmesINPITCH_75)

# Mean start speed per pitcher; the summary gives the quartile cut-offs used
# for the speed groups further down.
allPitchers75ormore <- PitcherAnalysis %>%
  group_by(pitcher_name) %>%
  summarise(mean_start_speed = mean(start_speed, na.rm = TRUE))
summary(allPitchers75ormore)
## pitcher_name mean_start_speed
## Length:140 Min. :81.73
## Class :character 1st Qu.:86.04
## Mode :character Median :88.08
## Mean :87.92
## 3rd Qu.:90.00
## Max. :93.74
# Per-pitcher mean start speed over the pitches thrown faster than 90 (the
# 3rd-quartile cut-off reported by summary(allPitchers75ormore)).
# NOTE(review): the > 90 filter is applied to individual pitches, not to each
# pitcher's mean, so nearly every pitcher qualifies (136 of 140 below) -
# confirm this is the intended definition of the top-quartile group.
Pitchers3rdQuartile <- PitcherAnalysis %>%
  filter(pitcher_name %in% nmesINPITCH_75) %>%
  filter(start_speed > 90) %>%
  group_by(pitcher_name) %>%
  summarise(mean_start_speed = mean(start_speed, na.rm = TRUE))
summary(Pitchers3rdQuartile)
## pitcher_name mean_start_speed
## Length:136 Min. :90.32
## Class :character 1st Qu.:91.15
## Mode :character Median :92.23
## Mean :92.57
## 3rd Qu.:93.66
## Max. :96.99

# Choose one pitcher name uniformly at random from Pitchers3rdQuartile.
# The index is drawn from the actual number of names: the previous
# ceiling(runif(1, -1, 136)) could evaluate to 0 (selecting nobody) and relied
# on a hard-coded table size.
top_quartile_pool <- unique(Pitchers3rdQuartile$pitcher_name)
stopifnot(length(top_quartile_pool) > 0)
rs <- sample.int(length(top_quartile_pool), 1)
nmesINPITCH_3rdQuartile <- top_quartile_pool[rs]

# All pitches (with a pitch type) thrown by the chosen pitcher; used by the
# "Top Quart." charts.
subPitchesTop <- all_data %>%
  select(
    V1, start_speed, des, date, pitcher_name, batter_name, count, type,
    pitch_type, event, b_height, p_throws, atbat_des, px, pz, stand, code,
    on_1b, on_2b, on_3b, nasty, o
  ) %>%
  filter(pitch_type != "NA") %>%
  filter(pitcher_name %in% nmesINPITCH_3rdQuartile)
# Pitch types by state of bases occupied chart for the top-quartile pitcher.
# `state` is built from on_1b / on_2b / on_3b: each position is 1 when the
# column is not NA and 0 otherwise.
subPitchesTop %>%
  filter(pitcher_name %in% nmesINPITCH_3rdQuartile) %>%
  mutate(
    state = paste0(
      as.integer(!is.na(on_1b)), "_",
      as.integer(!is.na(on_2b)), "_",
      as.integer(!is.na(on_3b))
    ),
    pitch_type = substr(pitch_type, 1, 2)
  ) %>%
  group_by(state, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(state) %>%
  mutate(perc_events = freq / sum(freq)) %>%
  ggplot(aes(x = state, y = perc_events, fill = pitch_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = paste(
    "Top Quart. Pitcher Speed vs Pitch Types-", nmesINPITCH_3rdQuartile[1],
    # Report the selected pitcher's own value. The title names one pitcher,
    # but the previous code averaged mean_start_speed over every pitcher in
    # Pitchers3rdQuartile, so the number never changed with the selection.
    "Ave. Fastball",
    round(
      Pitchers3rdQuartile$mean_start_speed[
        match(nmesINPITCH_3rdQuartile[1], Pitchers3rdQuartile$pitcher_name)
      ],
      digits = 0
    )
  ))
# Pitch-type share by count for the top-quartile pitcher, one facet per
# `ball_count` (the first two characters of the count string).
subPitchesTop %>%
  mutate(pitch_type = substr(pitch_type, 1, 2)) %>%
  filter(pitcher_name %in% nmesINPITCH_3rdQuartile) %>%
  group_by(count, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(count) %>%
  mutate(
    perc_pitches = freq / sum(freq),
    ball_count = substr(count, 1, 2)
  ) %>%
  ggplot(aes(x = count, y = perc_pitches, fill = pitch_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ball_count, scales = "free") +
  labs(title = paste(
    "Top Quart. Pitcher Speed vs Pitch Types-", nmesINPITCH_3rdQuartile[1],
    # Report the selected pitcher's own value. The title names one pitcher,
    # but the previous code averaged mean_start_speed over every pitcher in
    # Pitchers3rdQuartile, so the number never changed with the selection.
    "Ave. Fastball",
    round(
      Pitchers3rdQuartile$mean_start_speed[
        match(nmesINPITCH_3rdQuartile[1], Pitchers3rdQuartile$pitcher_name)
      ],
      digits = 0
    )
  ))
# Pitch-location charts for the top-quartile pitcher.
# Home runs: rows with type "X" and event "Home Run".
batterHITHR <- subPitchesTop %>%
  filter(type == "X", event == "Home Run")

# Handedness in the titles below is read from all of the pitcher's rows and
# collapsed to one string. batterHITHR$p_throws gave one title per home run,
# and nothing at all when the pitcher allowed no home runs.
ggplot() +
  geom_point(data = batterHITHR, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "Home Runs against", nmesINPITCH_3rdQuartile[1],
    "Throws", paste(unique(subPitchesTop$p_throws), collapse = "/")
  ))

# Home runs split by batter stance (`stand`).
ggplot() +
  geom_point(data = batterHITHR, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "Home Runs for L and R batters against", nmesINPITCH_3rdQuartile[1],
    "Throws", paste(unique(subPitchesTop$p_throws), collapse = "/")
  ))

# All pitches split by batter stance.
ggplot() +
  geom_point(data = subPitchesTop, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "All Pitches for L and R batters against", nmesINPITCH_3rdQuartile[1],
    "Throws", paste(unique(subPitchesTop$p_throws), collapse = "/")
  ))
# Set up middle-quartile information: per-pitcher mean start speed over the
# pitches with 86 < start_speed <= 90.
# NOTE(review): the speed window is applied to individual pitches, not to each
# pitcher's mean, so all 140 pitchers appear below - confirm this is the
# intended definition of the middle group.
PitchersMiddleQuartile <- PitcherAnalysis %>%
  filter(pitcher_name %in% nmesINPITCH_75) %>%
  filter(start_speed <= 90) %>%
  filter(start_speed > 86) %>%
  group_by(pitcher_name) %>%
  summarise(mean_start_speed = mean(start_speed, na.rm = TRUE))
summary(PitchersMiddleQuartile)
## pitcher_name mean_start_speed
## Length:140 Min. :86.82
## Class :character 1st Qu.:87.88
## Mode :character Median :88.20
## Mean :88.21
## 3rd Qu.:88.58
## Max. :89.56

# Choose one pitcher name uniformly at random from PitchersMiddleQuartile.
# The index is drawn from the actual number of names: the previous
# ceiling(runif(1, -1, 140)) could evaluate to 0 (selecting nobody) and relied
# on a hard-coded table size.
middle_quartile_pool <- unique(PitchersMiddleQuartile$pitcher_name)
stopifnot(length(middle_quartile_pool) > 0)
rs <- sample.int(length(middle_quartile_pool), 1)
nmesINPITCH_MiddleQuartile <- middle_quartile_pool[rs]

# All pitches (with a pitch type) thrown by the chosen pitcher; used by the
# "Mid Quart." charts.
subPitchesMiddle <- all_data %>%
  select(
    V1, start_speed, des, date, pitcher_name, batter_name, count, type,
    pitch_type, event, b_height, p_throws, atbat_des, px, pz, stand, code,
    on_1b, on_2b, on_3b, nasty, o
  ) %>%
  filter(pitch_type != "NA") %>%
  filter(pitcher_name %in% nmesINPITCH_MiddleQuartile)
# Share of each pitch type within every on-base state for the middle-quartile
# pitcher. `state` has one digit per base: 1 when on_1b / on_2b / on_3b is not
# NA, 0 otherwise.
subPitchesMiddle %>%
  filter(pitcher_name %in% nmesINPITCH_MiddleQuartile) %>%
  mutate(
    state = paste0(
      as.integer(!is.na(on_1b)), "_",
      as.integer(!is.na(on_2b)), "_",
      as.integer(!is.na(on_3b))
    ),
    pitch_type = substr(pitch_type, 1, 2)
  ) %>%
  group_by(state, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(state) %>%
  mutate(perc_events = freq / sum(freq)) %>%
  ggplot(aes(x = state, y = perc_events, fill = pitch_type)) +
  geom_col(position = "dodge") +
  labs(
    title = paste(
      "Mid Quart. On-Base vs Pitch Types against",
      nmesINPITCH_MiddleQuartile[1],
      "Ave. Fastball",
      round(mean(subPitchesMiddle$start_speed), digits = 0)
    )
  )
# Share of each pitch type within every count for the middle-quartile pitcher,
# faceted by `ball_count` (first two characters of the count string).
subPitchesMiddle %>%
  filter(pitcher_name %in% nmesINPITCH_MiddleQuartile) %>%
  mutate(pitch_type = substr(pitch_type, 1, 2)) %>%
  group_by(count, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(count) %>%
  mutate(
    perc_pitches = freq / sum(freq),
    ball_count = substr(count, 1, 2)
  ) %>%
  ggplot(aes(x = count, y = perc_pitches, fill = pitch_type)) +
  geom_col(position = "dodge") +
  facet_wrap(~ball_count, scales = "free") +
  labs(
    title = paste(
      "Mid Quart. Count vs Pitch Types against",
      nmesINPITCH_MiddleQuartile[1],
      "Ave. Fastball",
      round(mean(subPitchesMiddle$start_speed), digits = 0)
    )
  )
# Pitch-location charts for the middle-quartile pitcher.
# Home runs: rows with type "X" and event "Home Run".
batterHITHR <- subPitchesMiddle %>%
  filter(type == "X", event == "Home Run")

# Handedness in the titles below is read from all of the pitcher's rows and
# collapsed to one string. batterHITHR$p_throws gave one title per home run,
# and nothing at all when the pitcher allowed no home runs.
ggplot() +
  geom_point(data = batterHITHR, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "Home Runs against", nmesINPITCH_MiddleQuartile[1],
    "Throws", paste(unique(subPitchesMiddle$p_throws), collapse = "/")
  ))

# Home runs split by batter stance (`stand`).
ggplot() +
  geom_point(data = batterHITHR, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "Home Runs for L and R batters against", nmesINPITCH_MiddleQuartile[1],
    "Throws", paste(unique(subPitchesMiddle$p_throws), collapse = "/")
  ))

# All pitches split by batter stance.
ggplot() +
  geom_point(data = subPitchesMiddle, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "All Pitches for L and R batters against", nmesINPITCH_MiddleQuartile[1],
    "Throws", paste(unique(subPitchesMiddle$p_throws), collapse = "/")
  ))
# Swinging-strike ("whiff") charts for the pitcher in reqPitchers.
# Swap subPitches for subPitchesMiddle or subPitchesTop to chart another group.
#
# Titles read the pitcher's handedness from subPitches. They previously used
# batterHITHR$p_throws, which at this point holds the home runs of the
# middle-quartile pitcher - a different pitcher from the one being charted -
# and which also expanded the title to one string per row.
WhiffPitchTypes <- subPitches %>%
  filter(des == "Swinging Strike")

# Whiff pitch types by on-base state (1 = on_Xb is not NA, 0 = NA).
WhiffPitchTypes %>%
  filter(pitcher_name %in% reqPitchers) %>%
  mutate(
    state = paste0(
      as.integer(!is.na(on_1b)), "_",
      as.integer(!is.na(on_2b)), "_",
      as.integer(!is.na(on_3b))
    ),
    pitch_type = substr(pitch_type, 1, 2)
  ) %>%
  group_by(state, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(state) %>%
  mutate(perc_events = freq / sum(freq)) %>%
  ggplot(aes(x = state, y = perc_events, fill = pitch_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = paste(
    "WHIFF's vs Pitch Types-", reqPitchers[1],
    "Ave. Speed", round(mean(subPitches$start_speed), digits = 0),
    "Throws", paste(unique(subPitches$p_throws), collapse = "/")
  ))

# Whiff pitch types by count, faceted by the first two characters of the count.
WhiffPitchTypes %>%
  mutate(pitch_type = substr(pitch_type, 1, 2)) %>%
  filter(pitcher_name %in% reqPitchers) %>%
  group_by(count, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(count) %>%
  mutate(
    perc_pitches = freq / sum(freq),
    ball_count = substr(count, 1, 2)
  ) %>%
  ggplot(aes(x = count, y = perc_pitches, fill = pitch_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ball_count, scales = "free") +
  labs(title = paste(
    "Count Situations - WHIFF vs Pitch Types", reqPitchers[1],
    "Ave. Speed", round(mean(subPitches$start_speed), digits = 0),
    "Throws", paste(unique(subPitches$p_throws), collapse = "/")
  ))

# Whiff locations over the strike-zone outline.
ggplot() +
  geom_point(data = WhiffPitchTypes, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "WHIFF vs Pitch Types", reqPitchers[1],
    "Ave. Speed", round(mean(subPitches$start_speed), digits = 0),
    "Throws", paste(unique(subPitches$p_throws), collapse = "/")
  ))

# Whiff locations split by batter stance (`stand`).
ggplot() +
  geom_point(data = WhiffPitchTypes, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "WHIFF vs Pitch Types", reqPitchers[1],
    "Ave. Speed", round(mean(subPitches$start_speed), digits = 0),
    "Throws", paste(unique(subPitches$p_throws), collapse = "/")
  ))
# Swinging strikes whose at-bat event was a strikeout.
#
# In every title below the handedness column is collapsed to a single string:
# paste() is vectorised, so passing strikeoutWHIFF_PitchTypes$p_throws as-is
# produced one title per row.
strikeoutWHIFF_PitchTypes <- WhiffPitchTypes %>%
  filter(event == "Strikeout")

# Strikeout-whiff pitch types by on-base state (1 = on_Xb is not NA, 0 = NA).
strikeoutWHIFF_PitchTypes %>%
  filter(pitcher_name %in% reqPitchers) %>%
  mutate(
    state = paste0(
      as.integer(!is.na(on_1b)), "_",
      as.integer(!is.na(on_2b)), "_",
      as.integer(!is.na(on_3b))
    ),
    pitch_type = substr(pitch_type, 1, 2)
  ) %>%
  group_by(state, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(state) %>%
  mutate(perc_events = freq / sum(freq)) %>%
  ggplot(aes(x = state, y = perc_events, fill = pitch_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = paste(
    "Strikeout WHIFFs vs Pitch Types", reqPitchers[1],
    "Ave. Speed", round(mean(subPitches$start_speed), digits = 0),
    "Throws", paste(unique(strikeoutWHIFF_PitchTypes$p_throws), collapse = "/")
  ))

# Strikeout-whiff pitch types by count, faceted by the first two characters of
# the count.
strikeoutWHIFF_PitchTypes %>%
  mutate(pitch_type = substr(pitch_type, 1, 2)) %>%
  filter(pitcher_name %in% reqPitchers) %>%
  group_by(count, pitch_type) %>%
  summarise(freq = n()) %>%
  group_by(count) %>%
  mutate(
    perc_pitches = freq / sum(freq),
    ball_count = substr(count, 1, 2)
  ) %>%
  ggplot(aes(x = count, y = perc_pitches, fill = pitch_type)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~ball_count, scales = "free") +
  labs(title = paste(
    "Strikeout WHIFFs vs Pitch Types", reqPitchers[1],
    "Ave. Speed", round(mean(subPitches$start_speed), digits = 0),
    "Throws", paste(unique(strikeoutWHIFF_PitchTypes$p_throws), collapse = "/")
  ))

# Strikeout-whiff locations over the strike-zone outline.
ggplot() +
  geom_point(data = strikeoutWHIFF_PitchTypes, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "Strikeout WHIFFs vs Pitch Types", reqPitchers[1],
    "Ave. Speed", round(mean(subPitches$start_speed), digits = 0),
    "Throws", paste(unique(strikeoutWHIFF_PitchTypes$p_throws), collapse = "/")
  ))

# Strikeout-whiff locations split by batter stance (`stand`).
ggplot() +
  geom_point(data = strikeoutWHIFF_PitchTypes, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "Strikeout WHIFFs vs Pitch Types", reqPitchers[1],
    "Ave. Speed", round(mean(subPitches$start_speed), digits = 0),
    "Throws", paste(unique(strikeoutWHIFF_PitchTypes$p_throws), collapse = "/")
  ))
# Strikeout whiffs thrown while on_3b is not NA (runner on third): which
# pitch_type is thrown, and where.
#
# The handedness column is collapsed to a single string in both titles:
# paste() is vectorised, so passing PitchType_RunnerThirdWhiff$p_throws as-is
# produced one title per row.
PitchType_RunnerThirdWhiff <- strikeoutWHIFF_PitchTypes %>%
  filter(!is.na(on_3b))

# Locations over the strike-zone outline.
ggplot() +
  geom_point(data = PitchType_RunnerThirdWhiff, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "WHIFF pitch type - runner on third", reqPitchers[1],
    "Throws", paste(unique(PitchType_RunnerThirdWhiff$p_throws), collapse = "/")
  ))

# Same chart split by batter stance (`stand`).
ggplot() +
  geom_point(data = PitchType_RunnerThirdWhiff, aes(x = px, y = pz, shape = type, col = pitch_type)) +
  facet_grid(. ~ stand) +
  coord_equal() +
  geom_path(aes(x, y), data = mlbgameday::kzone) +
  labs(title = paste(
    "WHIFF pitch type - runner on third", reqPitchers[1],
    "Throws", paste(unique(PitchType_RunnerThirdWhiff$p_throws), collapse = "/")
  ))
# Fastball and off-speed subsets for different count states.
# These were the first analyses completed to drill into the data.

# First 165 distinct pitcher names found in all_data.
# NOTE(review): the names come from all_data, not from Pitchers75 - confirm
# all_data is already limited to the 75-or-more-pitches pitchers.
reqPitchers <- unique(all_data$pitcher_name)[seq_len(165)]

# Every pitch with a pitch type thrown by those pitchers
# (author's note: 273724 observations).
FastBallAnalyis <- all_data %>%
  filter(pitch_type != "NA", pitcher_name %in% reqPitchers)

# All fastballs.
FastballTotal <- FastBallAnalyis %>%
  filter(pitch_type %in% c("FC", "FF", "FS", "FT"))

# All off-speed pitches.
OffSpeedTotal <- FastBallAnalyis %>%
  filter(pitch_type %in% c("CH", "CU", "EP", "KC", "KN", "SC", "SI", "SL"))

# Counts treated as favouring the pitcher.
FastBall_PitcherFavState <- FastballTotal %>%
  filter(count %in% c("2-2", "1-2", "0-2", "0-1", "1-1"))
OffSpeed_PitcherFavState <- OffSpeedTotal %>%
  filter(count %in% c("2-2", "1-2", "0-2", "0-1", "1-1"))

# Counts treated as favouring the batter.
FastBall_BatterFavState <- FastballTotal %>%
  filter(count %in% c("2-1", "3-1", "3-2", "1-0", "2-0", "3-0"))
OffSpeed_BatterFavState <- OffSpeedTotal %>%
  filter(count %in% c("2-1", "3-1", "3-2", "1-0", "2-0", "3-0"))

# First pitch of the at-bat (count 0-0).
FirstPitchFastball <- FastballTotal %>%
  filter(count == "0-0")
FirstPitchOffspeed <- OffSpeedTotal %>%
  filter(count == "0-0")
# Plot relevant information.
# Label every pitch as fastball / off-speed and every count as favouring the
# pitcher, the batter, or being the first pitch.
# NOTE(review): any pitch_type starting with "F" is labelled a fastball here
# and everything else off-speed, which is broader than the explicit code lists
# used for FastballTotal / OffSpeedTotal - confirm this is intended.
dataforplot <- FastBallAnalyis %>%
  select(count, pitch_type) %>%
  mutate(
    pitch_type2 = ifelse(substr(pitch_type, 1, 1) == "F", "FastBAll", "offSpeed"),
    situation = ifelse(
      count %in% c("2-2", "1-2", "0-2", "0-1", "1-1"),
      "favPitcher",
      ifelse(count == "0-0", "firstPitch", "favBatter")
    )
  )

# Percentage of fastballs and off-speed pitches in the favourable and
# unfavourable situations (first pitches excluded); percentages are computed
# within each pitch class.
datatemp <- dataforplot %>%
  group_by(pitch_type2, situation) %>%
  summarise(count = n()) %>%
  filter(situation != "firstPitch") %>%
  group_by(pitch_type2) %>%
  mutate(
    total = sum(count),
    perc = count / total
  )

datatemp %>%
  ggplot(aes(x = situation, y = perc, fill = situation)) +
  geom_col() +
  facet_wrap(~pitch_type2) +
  scale_y_continuous(labels = percent) +
  labs(title = "Fastball vs Off-speed when count is favorable and unfavorable")