# Import Greinke dataset
# stringsAsFactors = TRUE keeps the text columns as factors. That was the
# read.csv() default before R 4.0.0; the factor columns shown by str() and the
# later droplevels() call on pitch_type both depend on it.
greinke <- read.csv("greinke.csv", stringsAsFactors = TRUE)
# Print the first 6 rows of the data
head(greinke)
## p_name pitcher_id batter_stand pitch_type pitch_result
## 1 Zack Greinke 425844 R FF Ball
## 2 Zack Greinke 425844 R FF Swinging Strike
## 3 Zack Greinke 425844 R FF Called Strike
## 4 Zack Greinke 425844 R SL Swinging Strike
## 5 Zack Greinke 425844 R FF Swinging Strike
## 6 Zack Greinke 425844 R SL Swinging Strike
## atbat_result start_speed z0 x0 pfx_x pfx_z px pz
## 1 Walk 94.2 5.997 -0.675 -4.457 9.760 1.714 1.925
## 2 Single 92.4 6.281 -0.760 -1.590 11.400 0.589 3.271
## 3 Home Run 92.7 6.168 -0.958 -1.884 9.245 0.399 2.918
## 4 Strikeout 86.9 6.077 -0.939 3.594 0.762 0.764 1.306
## 5 Strikeout 92.8 6.107 -0.524 -0.558 11.134 1.517 2.193
## 6 Strikeout 87.8 6.321 -0.948 4.313 0.132 0.695 3.431
## break_angle break_length spin_rate spin_dir balls strikes outs game_date
## 1 24.8 3.5 2188.802 204.457 2 2 2 10/3/2015
## 2 10.1 2.7 2312.202 187.913 1 1 0 10/3/2015
## 3 9.2 3.5 1889.841 191.468 0 0 1 10/3/2015
## 4 -11.4 8.0 693.649 102.648 1 2 0 10/3/2015
## 5 -0.4 2.8 2242.916 182.859 1 2 0 10/3/2015
## 6 -13.6 7.8 828.693 92.330 2 2 1 10/3/2015
## inning inning_topbot batted_ball_type batted_ball_velocity hc_x hc_y
## 1 4 top NA 0.00 0.00
## 2 3 top 104 123.56 97.26
## 3 5 top 103 50.88 31.17
## 4 6 top NA 0.00 0.00
## 5 8 top NA 0.00 0.00
## 6 1 top NA 0.00 0.00
## pitch_id distance_feet
## 1 160 NA
## 2 95 0
## 3 218 425
## 4 265 NA
## 5 374 NA
## 6 14 NA
# How many pitches does the raw data frame hold?
nrow(greinke)
## [1] 3239
# Distribution of start_speed, including the count of missing values
summary(greinke$start_speed)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 52.20 87.30 89.80 88.44 91.80 95.40 3
# Keep only the rows whose start_speed is present
greinke <- greinke[!is.na(greinke$start_speed), , drop = FALSE]
# Row count after removing the missing speeds
nrow(greinke)
## [1] 3236
# Show the column types of the cleaned data frame
str(greinke)
## 'data.frame': 3236 obs. of 29 variables:
## $ p_name : Factor w/ 1 level "Zack Greinke": 1 1 1 1 1 1 1 1 1 1 ...
## $ pitcher_id : int 425844 425844 425844 425844 425844 425844 425844 425844 425844 425844 ...
## $ batter_stand : Factor w/ 2 levels "L","R": 2 2 2 2 2 2 2 2 2 2 ...
## $ pitch_type : Factor w/ 8 levels "","CH","CU","EP",..: 5 5 5 8 5 8 2 5 8 8 ...
## $ pitch_result : Factor w/ 15 levels "Ball","Ball In Dirt",..: 1 14 3 14 14 14 15 3 4 14 ...
## $ atbat_result : Factor w/ 24 levels "Bunt Groundout",..: 24 20 12 21 21 21 21 21 10 24 ...
## $ start_speed : num 94.2 92.4 92.7 86.9 92.8 87.8 90.3 92.7 85.5 87.3 ...
## $ z0 : num 6 6.28 6.17 6.08 6.11 ...
## $ x0 : num -0.675 -0.76 -0.958 -0.939 -0.524 ...
## $ pfx_x : num -4.457 -1.59 -1.884 3.594 -0.558 ...
## $ pfx_z : num 9.76 11.4 9.245 0.762 11.134 ...
## $ px : num 1.714 0.589 0.399 0.764 1.517 ...
## $ pz : num 1.93 3.27 2.92 1.31 2.19 ...
## $ break_angle : num 24.8 10.1 9.2 -11.4 -0.4 -13.6 22.5 25.1 -8.4 -11.3 ...
## $ break_length : num 3.5 2.7 3.5 8 2.8 7.8 7.4 3.8 7.5 7.4 ...
## $ spin_rate : num 2189 2312 1890 694 2243 ...
## $ spin_dir : num 204 188 191 103 183 ...
## $ balls : int 2 1 0 1 1 2 1 0 0 0 ...
## $ strikes : int 2 1 0 2 2 2 2 2 0 1 ...
## $ outs : int 2 0 1 0 0 1 1 2 2 2 ...
## $ game_date : Factor w/ 32 levels "10/3/2015","4/12/2015",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ inning : int 4 3 5 6 8 1 6 5 8 4 ...
## $ inning_topbot : Factor w/ 2 levels "bot","top": 2 2 2 2 2 2 2 2 2 2 ...
## $ batted_ball_type : Factor w/ 5 levels "","FB","GB","LD",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ batted_ball_velocity: int NA 104 103 NA NA NA NA NA NA NA ...
## $ hc_x : num 0 123.6 50.9 0 0 ...
## $ hc_y : num 0 97.3 31.2 0 0 ...
## $ pitch_id : int 160 95 218 265 374 14 279 231 386 156 ...
## $ distance_feet : int NA 0 425 NA NA NA NA NA NA NA ...
# Inspect how game_date is currently stored
class(greinke$game_date)
## [1] "factor"
# Parse the month/day/year text into Date values
greinke[["game_date"]] <- as.Date(as.character(greinke[["game_date"]]),
                                   format = "%m/%d/%Y")
# Confirm the column is now a Date
class(greinke$game_date)
## [1] "Date"
library(tidyr)
# Split game_date on "-" into year, month and day columns, keeping the
# original game_date column alongside them
greinke <- separate(greinke, game_date,
                    into = c("year", "month", "day"),
                    sep = "-", remove = FALSE)
# Store month as a number so it can be compared with 7 below
greinke[["month"]] <- as.numeric(greinke[["month"]])
# Label every pitch as thrown in July or in any other month
greinke$july <- with(greinke, ifelse(month == 7, "july", "other"))
# View the head() of greinke
head(greinke)
## p_name pitcher_id batter_stand pitch_type pitch_result
## 1 Zack Greinke 425844 R FF Ball
## 2 Zack Greinke 425844 R FF Swinging Strike
## 3 Zack Greinke 425844 R FF Called Strike
## 4 Zack Greinke 425844 R SL Swinging Strike
## 5 Zack Greinke 425844 R FF Swinging Strike
## 6 Zack Greinke 425844 R SL Swinging Strike
## atbat_result start_speed z0 x0 pfx_x pfx_z px pz
## 1 Walk 94.2 5.997 -0.675 -4.457 9.760 1.714 1.925
## 2 Single 92.4 6.281 -0.760 -1.590 11.400 0.589 3.271
## 3 Home Run 92.7 6.168 -0.958 -1.884 9.245 0.399 2.918
## 4 Strikeout 86.9 6.077 -0.939 3.594 0.762 0.764 1.306
## 5 Strikeout 92.8 6.107 -0.524 -0.558 11.134 1.517 2.193
## 6 Strikeout 87.8 6.321 -0.948 4.313 0.132 0.695 3.431
## break_angle break_length spin_rate spin_dir balls strikes outs
## 1 24.8 3.5 2188.802 204.457 2 2 2
## 2 10.1 2.7 2312.202 187.913 1 1 0
## 3 9.2 3.5 1889.841 191.468 0 0 1
## 4 -11.4 8.0 693.649 102.648 1 2 0
## 5 -0.4 2.8 2242.916 182.859 1 2 0
## 6 -13.6 7.8 828.693 92.330 2 2 1
## game_date year month day inning inning_topbot batted_ball_type
## 1 2015-10-03 2015 10 03 4 top
## 2 2015-10-03 2015 10 03 3 top
## 3 2015-10-03 2015 10 03 5 top
## 4 2015-10-03 2015 10 03 6 top
## 5 2015-10-03 2015 10 03 8 top
## 6 2015-10-03 2015 10 03 1 top
## batted_ball_velocity hc_x hc_y pitch_id distance_feet july
## 1 NA 0.00 0.00 160 NA other
## 2 104 123.56 97.26 95 0 other
## 3 103 50.88 31.17 218 425 other
## 4 NA 0.00 0.00 265 NA other
## 5 NA 0.00 0.00 374 NA other
## 6 NA 0.00 0.00 14 NA other
# Count pitches in July versus all other months
summary(factor(greinke$july))
## july other
## 524 2712
# Histogram of start speed over every pitch in the data
hist(greinke$start_speed)
# greinke_july: only the pitches flagged "july"
greinke_july <- greinke[which(greinke$july == "july"), , drop = FALSE]
# greinke_other: only the pitches flagged "other"
greinke_other <- greinke[which(greinke$july == "other"), , drop = FALSE]
# Draw the July and other-month histograms side by side.
# par() returns the settings it replaces; keep them so the 1 x 2 layout can be
# undone afterwards instead of leaking into every later plot.
old_par <- par(mfrow = c(1, 2))
# Plot start_speed histogram from july
hist(greinke_july$start_speed)
# Plot start_speed histogram for other months
hist(greinke_other$start_speed)
# Restore the layout that was active before this comparison
par(old_par)
# Create july_ff: July pitches with pitch_type "FF"
july_ff <- subset(greinke_july, pitch_type == "FF")
# Create other_ff: other-month pitches with pitch_type "FF"
other_ff <- subset(greinke_other, pitch_type == "FF")
# Two panels side by side. par() returns the settings it replaces; they are
# saved here and restored below so the plots that follow get a full-width,
# single-panel device again.
old_par <- par(mfrow = c(1, 2))
# Plot histogram of July fastball speeds
hist(july_ff$start_speed)
# Plot histogram of other month fastball speeds
hist(other_ff$start_speed)
# Restore the layout that was active before this comparison
par(old_par)
# Overlay the two fastball velocity distributions on a density scale.
# Other months go first and set up the axes and labels.
hist(other_ff$start_speed,
     freq = FALSE, col = "#00009950",
     main = "Greinke 4-Seam Fastball Velocity",
     xlab = "Velocity (mph)", ylim = c(0, .35))
# July is added on top of the same axes
hist(july_ff$start_speed,
     freq = FALSE, col = "#99000050", add = TRUE)
# Vertical lines at the two group means: other months first, then July,
# each in the colour of its histogram
abline(v = c(mean(other_ff$start_speed), mean(july_ff$start_speed)),
       col = c("#00009950", "#99000050"), lwd = 2)
# Mean start speed of all pitches, July versus other months
with(greinke, tapply(start_speed, july, mean))
## july other
## 88.86489 88.35601
# greinke_ff: only the pitches with pitch_type "FF"
greinke_ff <- greinke[which(greinke$pitch_type == "FF"), , drop = FALSE]
# ff_velo_month: mean fastball start speed, July versus other months
ff_velo_month <- with(greinke_ff, tapply(start_speed, july, mean))
# Print ff_velo_month
ff_velo_month
## july other
## 92.42077 91.66474
# Create ff_dt: the mean start_speed of the "FF" pitches for each game_date.
# tapply() returns one mean per date, named by that date. data.frame() turns
# those names into row names and builds the single column's name from the
# text of the call itself, so reformatting this expression changes that name.
ff_dt <- data.frame(tapply(greinke_ff$start_speed, greinke_ff$game_date, mean))
# Print the first 6 rows of ff_dt
head(ff_dt)
## tapply.greinke_ff.start_speed..greinke_ff.game_date..mean.
## 2015-04-07 90.82632
## 2015-04-12 90.51622
## 2015-04-18 90.28654
## 2015-04-24 90.51277
## 2015-04-29 90.40732
## 2015-05-05 90.33043
# Recover the game dates from the row names and store them as a Date column
ff_dt$game_date <- as.Date(rownames(ff_dt), format = "%Y-%m-%d")
# Give the first (velocity) column a readable name
names(ff_dt)[1] <- "start_speed"
# Discard the date row names so rows are numbered again
rownames(ff_dt) <- NULL
# View head of ff_dt
head(ff_dt)
## start_speed game_date
## 1 90.82632 2015-04-07
## 2 90.51622 2015-04-12
## 3 90.28654 2015-04-18
## 4 90.51277 2015-04-24
## 5 90.40732 2015-04-29
## 6 90.33043 2015-05-05
# Line chart of the per-game mean 4-seam fastball velocity against game date
plot(start_speed ~ game_date, data = ff_dt,
     type = "l", lwd = 4, ylim = c(88, 95),
     xlab = "Date", ylab = "Velocity (mph)",
     main = "Greinke 4-Seam Fastball Velocity")
# Redraw the per-game mean velocity line as the base layer
plot(start_speed ~ game_date, data = ff_dt,
     type = "l", lwd = 4, ylim = c(88, 95),
     xlab = "Date", ylab = "Velocity (mph)",
     main = "Greinke 4-Seam Fastball Velocity")
# Overlay every individual "FF" pitch; the dates are converted to numbers and
# jittered so pitches from the same game do not sit in one vertical line
points(jitter(as.numeric(greinke_ff$game_date)), greinke_ff$start_speed,
       col = "#99004450", pch = 16)
# Exclude the pitches typed "IN" or "EP"
greinke <- subset(greinke, !(pitch_type == "IN" | pitch_type == "EP"))
# Remove the factor levels that no longer occur in pitch_type
greinke$pitch_type <- droplevels(greinke$pitch_type)
# type_tab: pitch counts by pitch type (rows) and july/other (columns)
type_tab <- table(greinke$pitch_type, greinke$july)
# Print type_tab
type_tab
##
## july other
## CH 112 487
## CU 51 242
## FF 207 1191
## FT 66 255
## SL 86 535
# type_prop: share of each pitch type within its column (July / other)
type_prop <- prop.table(type_tab, margin = 2)
# Round the shares to three decimals
type_prop <- round(type_prop, digits = 3)
# Print type_prop
type_prop
##
## july other
## CH 0.215 0.180
## CU 0.098 0.089
## FF 0.397 0.439
## FT 0.126 0.094
## SL 0.165 0.197
# Create ff_prop: the "FF" row of type_prop (July and other-month shares).
# The row is selected by name rather than by position, so it still picks the
# fastball row if the set or order of pitch types in the table changes.
ff_prop <- type_prop["FF", ]
# Print ff_prop
ff_prop
## july other
## 0.397 0.439
# Print ff_velo_month
ff_velo_month
## july other
## 92.42077 91.66474
# Turn the proportion table into a data frame with one row per pitch type
type_prop <- as.data.frame.matrix(type_prop)
# Copy the pitch codes out of the row names and add them as the first column
Pitch <- rownames(type_prop)
type_prop <- cbind(Pitch, type_prop)
names(type_prop) <- c("Pitch", "July", "Other")
# Difference: change in the July share relative to the other-month share
type_prop$Difference <- with(type_prop, (July - Other) / Other)
# Print the type_prop
type_prop
## Pitch July Other Difference
## CH CH 0.215 0.180 0.19444444
## CU CU 0.098 0.089 0.10112360
## FF FF 0.397 0.439 -0.09567198
## FT FT 0.126 0.094 0.34042553
## SL SL 0.165 0.197 -0.16243655
# Bar chart of the relative change in usage of each pitch type in July.
# Difference is a fraction of the other-month share (0.34 means +34 %).
# The tallest bar, FT, is about 0.34, so the upper y limit has to exceed 0.3
# for the axis to cover the whole bar.
barplot(type_prop$Difference, names.arg = type_prop$Pitch,
        main = "Pitch Usage in July vs. Other Months",
        ylab = "Percentage Change in July",
        ylim = c(-0.3, 0.4))