For this project we will use these packages to clean and visualize our dataset.
pacman::p_load(tidyverse,rio,ggplot2,ggpubr,ggsci,vtable,rmarkdown)
Next we can use the read_csv function to load our dataset into R. To save extra typing, the dataset will be referred to as “mlb” from now on and is saved as a variable under that name.
# Load the batting exit-velocity data from the local copy of the CSV.
# The Kaggle link (https://www.kaggle.com/datasets/mattop/mlb-batting-exit-velocity-data-2015-2022)
# points at the dataset's web page rather than a raw CSV file, so it cannot be
# passed to read.csv() directly; download the file from there first.
mlb_batter_exit_velocity <- read_csv(
  "R_Files/R Course Files/mlb-batter-exit-velocity.csv"
)
## Rows: 1983 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): player
## dbl (19): id, rank, year, batted_ball_events, launch_angle, sweet_spot_perce...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Refer to the loaded data by the shorter name `mlb` from here on.
mlb <- mlb_batter_exit_velocity
We can run some summary statistics on the MLB data. The first function gives some descriptive statistics for the data and the second function creates a nice table for us to view the descriptive statistics in. Some of the variables such as ID, Rank and Year are less useful/interesting to get summary statistics on and can be excluded from this section by creating a second dataframe that removes these columns. We will only use that dataframe for this table.
# Drop the first three columns (described in the text as ID, Rank and Year)
# and print base R's per-column summary of what remains.
mlb2 <- mlb[, -(1:3)]
summary(mlb2)
## player batted_ball_events launch_angle sweet_spot_percentage
## Length:1983 Min. : 67.0 Min. :-7.90 Min. :18.50
## Class :character 1st Qu.:256.0 1st Qu.: 9.50 1st Qu.:30.90
## Mode :character Median :333.0 Median :12.50 Median :33.50
## Mean :328.7 Mean :12.53 Mean :33.59
## 3rd Qu.:415.0 3rd Qu.:15.55 3rd Qu.:36.30
## Max. :599.0 Max. :26.80 Max. :49.70
##
## max_ev average_ev fly_ball_line_drive_ev ground_ball_ev
## Min. : 98.5 Min. :78.20 Min. : 83.00 Min. :71.20
## 1st Qu.:108.6 1st Qu.:87.35 1st Qu.: 91.20 1st Qu.:84.10
## Median :110.6 Median :88.80 Median : 92.90 Median :85.90
## Mean :110.8 Mean :88.77 Mean : 92.81 Mean :85.86
## 3rd Qu.:112.9 3rd Qu.:90.10 3rd Qu.: 94.50 3rd Qu.:87.60
## Max. :122.2 Max. :96.00 Max. :101.30 Max. :94.10
##
## max_distance average_distance average_homerun hard_hit_95mph+
## Min. :359 Min. : 98.0 Min. :342.0 Min. : 6.0
## 1st Qu.:423 1st Qu.:158.0 1st Qu.:392.0 1st Qu.: 87.5
## Median :436 Median :172.0 Median :399.0 Median :120.0
## Mean :436 Mean :171.4 Mean :398.3 Mean :122.3
## 3rd Qu.:449 3rd Qu.:186.0 3rd Qu.:406.0 3rd Qu.:157.0
## Max. :505 Max. :234.0 Max. :438.0 Max. :274.0
## NA's :9
## hard_hit_percentage hard_hit_swing_percentage total_barrels
## Min. : 6.40 Min. : 2.30 Min. : 0.00
## 1st Qu.:32.80 1st Qu.:12.30 1st Qu.:12.00
## Median :38.20 Median :14.20 Median :21.00
## Mean :37.57 Mean :14.14 Mean :23.92
## 3rd Qu.:42.70 3rd Qu.:16.00 3rd Qu.:33.00
## Max. :62.20 Max. :23.90 Max. :91.00
##
## barrels_batted_balls_percentage barrels_plate_appearance_percentage
## Min. : 0.000 Min. : 0.000
## 1st Qu.: 4.500 1st Qu.: 3.200
## Median : 7.100 Median : 4.900
## Mean : 7.521 Mean : 5.021
## 3rd Qu.:10.150 3rd Qu.: 6.700
## Max. :26.700 Max. :15.800
##
# Formatted summary-statistics table for mlb2 via st(); `vars` is left at its
# default rather than being passed as an empty argument.
st(mlb2, title = "MLB Exit Velocity Summary Statistics")
| Variable | N | Mean | Std. Dev. | Min | Pctl. 25 | Pctl. 75 | Max |
|---|---|---|---|---|---|---|---|
| batted_ball_events | 1983 | 329 | 113 | 67 | 256 | 415 | 599 |
| launch_angle | 1983 | 13 | 4.6 | -7.9 | 9.5 | 16 | 27 |
| sweet_spot_percentage | 1983 | 34 | 4.1 | 18 | 31 | 36 | 50 |
| max_ev | 1983 | 111 | 3.2 | 98 | 109 | 113 | 122 |
| average_ev | 1983 | 89 | 2.2 | 78 | 87 | 90 | 96 |
| fly_ball_line_drive_ev | 1983 | 93 | 2.6 | 83 | 91 | 94 | 101 |
| ground_ball_ev | 1983 | 86 | 2.6 | 71 | 84 | 88 | 94 |
| max_distance | 1983 | 436 | 20 | 359 | 423 | 449 | 505 |
| average_distance | 1983 | 171 | 20 | 98 | 158 | 186 | 234 |
| average_homerun | 1974 | 398 | 11 | 342 | 392 | 406 | 438 |
| hard_hit_95mph+ | 1983 | 122 | 48 | 6 | 88 | 157 | 274 |
| hard_hit_percentage | 1983 | 38 | 7.9 | 6.4 | 33 | 43 | 62 |
| hard_hit_swing_percentage | 1983 | 14 | 2.9 | 2.3 | 12 | 16 | 24 |
| total_barrels | 1983 | 24 | 15 | 0 | 12 | 33 | 91 |
| barrels_batted_balls_percentage | 1983 | 7.5 | 4.2 | 0 | 4.5 | 10 | 27 |
| barrels_plate_appearance_percentage | 1983 | 5 | 2.5 | 0 | 3.2 | 6.7 | 16 |
# Give three columns shorter, clearer names for use in the plots below.
mlb <- mlb %>%
  rename(
    hard_hit_count = "hard_hit_95mph+",
    average_homerun_distance = "average_homerun",
    average_hit_distance = "average_distance"
  )
# Plot for average exit velocity: density-scaled histogram with a kernel
# density overlay and a secondary axis showing approximate counts.
ggplot(data = mlb, aes(x = average_ev)) +
  # after_stat(density) replaces `..density..`, deprecated in ggplot2 3.4.0.
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 20, color = "black", fill = "white"
  ) +
  geom_density(alpha = .2, fill = "#ff6666") +
  # NOTE(review): the density -> count multiplier (1500) is hard-coded; the
  # exact factor is n * bin width, so confirm it matches this histogram.
  scale_y_continuous(
    name = "Density",
    sec.axis = sec_axis(~ . * 1500, name = "Count", breaks = seq(0, 350, 50))
  ) +
  theme(
    axis.title.y.left = element_text(vjust = 3),
    axis.title.y.right = element_text(vjust = 3)
  ) +
  labs(
    x = "Average Exit Velocity (mph)",
    title = "Average Exit Velocity Distribution"
  )
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Plot for max homerun distance: density-scaled histogram of `max_distance`
# with a kernel density overlay and a secondary axis showing approximate counts.
ggplot(data = mlb, aes(x = max_distance)) +
  # after_stat(density) replaces `..density..`, deprecated in ggplot2 3.4.0.
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 25, color = "darkblue", fill = "white"
  ) +
  geom_density(alpha = .2, fill = "cyan") +
  # NOTE(review): the density -> count multiplier (25000) is hard-coded; the
  # exact factor is n * bin width, so confirm it matches this histogram.
  scale_y_continuous(
    name = "Density",
    sec.axis = sec_axis(~ . * 25000, name = "Count", breaks = seq(0, 500, 100))
  ) +
  theme(
    axis.title.y.left = element_text(vjust = 3),
    axis.title.y.right = element_text(vjust = 3)
  ) +
  labs(
    x = "Maximum Homerun Distance (ft)",
    title = "Maximum Homerun Distance Distribution"
  )
# Plot for launch angle: density-scaled histogram with a kernel density
# overlay and a secondary axis showing approximate counts.
ggplot(data = mlb, aes(x = launch_angle)) +
  # after_stat(density) replaces `..density..`, deprecated in ggplot2 3.4.0.
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 20, color = "purple", fill = "white"
  ) +
  geom_density(alpha = .2, fill = "magenta") +
  # NOTE(review): the density -> count multiplier (3500) is hard-coded; the
  # exact factor is n * bin width, so confirm it matches this histogram.
  scale_y_continuous(
    name = "Density",
    sec.axis = sec_axis(~ . * 3500, name = "Count", breaks = seq(0, 350, 50))
  ) +
  theme(
    axis.title.y.left = element_text(vjust = 3),
    axis.title.y.right = element_text(vjust = 3)
  ) +
  labs(
    x = "Launch Angle (°)",
    title = "Launch Angle Distribution"
  )
# Hard hit percentage: horizontal boxplot (plot1) stacked beneath a histogram
# (plot2) so both share the same x axis.
plot1 <- ggplot(data = mlb, aes(x = hard_hit_percentage)) +
  geom_boxplot(fill = "lightblue", color = "black") +
  xlab("Hard Hit Percentage") +
  theme_classic() +
  # The boxplot's y axis carries no information, so hide its text and ticks.
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

# The histogram inherits x from the plot-level aes(); its y axis is the raw
# bin count, so it is labelled "Count".
plot2 <- ggplot(data = mlb, aes(x = hard_hit_percentage)) +
  geom_histogram(bins = 30, fill = "lightblue", color = "black") +
  ylab("Count") +
  xlab("Hard Hit Percentage") +
  theme_classic()

# Histogram on top at twice the boxplot's height, with left/right axes aligned.
cowplot::plot_grid(
  plot2, plot1,
  ncol = 1, rel_heights = c(2, 1),
  align = "v", axis = "lr"
)
# Sweet spot percentage: horizontal boxplot (plot3) stacked beneath a histogram
# (plot4) so both share the same x axis.
plot3 <- ggplot(data = mlb, aes(x = sweet_spot_percentage)) +
  geom_boxplot(fill = "gold", color = "black") +
  xlab("Sweet Spot Percentage") +
  theme_classic() +
  # Applied after theme_classic(), which would otherwise reset these settings.
  theme(
    axis.text.y = element_blank(),
    axis.ticks.y = element_blank()
  )

# The histogram must bin sweet_spot_percentage (inherited from the plot-level
# aes()), not hard_hit_percentage; its y axis is the raw bin count.
plot4 <- ggplot(data = mlb, aes(x = sweet_spot_percentage)) +
  geom_histogram(bins = 30, fill = "gold", color = "black") +
  xlab("Sweet Spot Percentage") +
  ylab("Count") +
  theme_classic()

# Histogram on top, boxplot below (4:3 height ratio), left/right axes aligned.
cowplot::plot_grid(
  plot4, plot3,
  ncol = 1, rel_heights = c(4, 3),
  align = "v", axis = "lr"
)
# Scatter plot of hard hit percentage against average exit velocity, coloured
# by maximum exit velocity, with a fitted linear trend and its equation / R^2.
ggplot(data = mlb, mapping = aes(x = hard_hit_percentage, y = average_ev)) +
  geom_point(aes(color = max_ev)) +
  geom_smooth(method = "lm") +
  # after_stat() replaces the `..name..` notation deprecated in ggplot2 3.4.0.
  stat_regline_equation(label.y = 98, aes(label = after_stat(eq.label))) +
  stat_regline_equation(label.y = 96, aes(label = after_stat(rr.label))) +
  # Only colour is mapped, so the colour scale's default guide is used.
  scale_color_gradient(low = "blue", high = "red") +
  labs(
    title = "Hard Hit Percentage vs. Average Exit Velocity",
    x = "Hard Hit Percentage (%)",
    y = "Average Exit Velocity (mph)",
    color = "Maximum Exit Velocity"
  )
## `geom_smooth()` using formula = 'y ~ x'
# Scatter plot of hard hit count against total barrels, coloured by maximum
# exit velocity, with a fitted linear trend and its equation / R^2.
ggplot(data = mlb, mapping = aes(x = hard_hit_count, y = total_barrels)) +
  geom_point(aes(color = max_ev)) +
  geom_smooth(method = "lm") +
  theme_classic() +
  # after_stat() replaces the `..name..` notation deprecated in ggplot2 3.4.0.
  stat_regline_equation(label.y = 89, aes(label = after_stat(eq.label))) +
  stat_regline_equation(label.y = 82, aes(label = after_stat(rr.label))) +
  # Only colour is mapped, so the colour scale's default guide is used.
  scale_color_gradient(low = "blue", high = "red") +
  labs(
    title = "Hard Hit Count vs. Total Barrels",
    x = "Hard Hit Count",
    y = "Total Barrels",
    color = "Maximum Exit Velocity"
  )
## `geom_smooth()` using formula = 'y ~ x'