if(!require(dplyr)) install.packages("dplyr")
library(dplyr)
if(!require(Lahman)) install.packages("Lahman")
library(Lahman)
if(!require(stargazer)) install.packages("stargazer")
library(stargazer)
if(!require(magrittr)) install.packages("magrittr")
library(magrittr)
if(!require(rmarkdown)) install.packages("rmarkdown")
library(rmarkdown)
# Q1: preview the first six rows of the Pitching table.
Q1 <- Pitching %>% head(n = 6L)
Q1
## playerID yearID stint teamID lgID W L G GS CG SHO SV IPouts H ER HR BB
## 1 bechtge01 1871 1 PH1 NA 1 2 3 3 2 0 0 78 43 23 0 11
## 2 brainas01 1871 1 WS3 NA 12 15 30 30 30 0 0 792 361 132 4 37
## 3 fergubo01 1871 1 NY2 NA 0 0 1 0 0 0 0 3 8 3 0 0
## 4 fishech01 1871 1 RC1 NA 4 16 24 24 22 1 0 639 295 103 3 31
## 5 fleetfr01 1871 1 NY2 NA 0 1 1 1 1 0 0 27 20 10 0 3
## 6 flowedi01 1871 1 TRO NA 0 0 1 0 0 0 0 3 1 0 0 0
## SO BAOpp ERA IBB WP HBP BK BFP GF R SH SF GIDP
## 1 1 NA 7.96 NA 7 NA 0 146 0 42 NA NA NA
## 2 13 NA 4.50 NA 7 NA 0 1291 0 292 NA NA NA
## 3 0 NA 27.00 NA 2 NA 0 14 0 9 NA NA NA
## 4 15 NA 4.35 NA 20 NA 0 1080 1 257 NA NA NA
## 5 0 NA 10.00 NA 0 NA 0 57 0 21 NA NA NA
## 6 0 NA 0.00 NA 0 NA 0 3 1 0 NA NA NA
# Q2a: rows of Pitching with SHO of at least 12.
# The result is stored as a plain data frame so that later steps operate on
# the data itself. paged_table() (from the "rmarkdown" package) creates a
# pageable version of a data frame for HTML output; it is used here only for
# display. You don't need it in your assignment and can ignore it in the
# answers. The same applies to the code that follows.
Q2a <- Pitching %>%
  filter(SHO >= 12)
paged_table(Q2a)

# Q2b: of the Q2a rows, keep those with ERA of 2 or lower.
# Equivalent one-step form: Q2b <- filter(Pitching, SHO >= 12 & ERA <= 2)
Q2b <- Q2a %>%
  filter(ERA <= 2)
paged_table(Q2b)
# Q3a: rows of Pitching for yearID 2019, stored as a plain data frame.
# paged_table() is applied only when displaying each result.
Q3a <- Pitching %>%
  filter(yearID == 2019)
paged_table(Q3a)

# Q3b: keep only the player/team identifiers and the SO, BB and ERA columns.
Q3b <- Q3a %>%
  select(playerID, teamID, SO, BB, ERA)
paged_table(Q3b)

# Q3c: add SOBB, the ratio of SO to BB.
# Rows with BB == 0 get Inf from the division (NaN when SO is also 0).
Q3c <- Q3b %>%
  mutate(SOBB = SO / BB)
paged_table(Q3c)

# Q3d: rows with teamID "CIN", ordered from highest to lowest SOBB.
# A related but NOT equivalent query keeps every team and sorts by team,
# then by descending SOBB within each team:
#   Q3c %>% arrange(teamID, desc(SOBB))
Q3d <- Q3c %>%
  filter(teamID == "CIN") %>%
  arrange(desc(SOBB))
paged_table(Q3d)
# Q4a: summary statistics of ERA in the 2019 data.
# Q3a is the data frame of 2019 Pitching rows (built in question 3a).
# Some ERA values are missing in the 2019 data, so na.rm = TRUE drops the
# missing ERA values before each statistic is computed.
mean(Q3a[["ERA"]], na.rm = TRUE)
## [1] 6.02181
median(Q3a[["ERA"]], na.rm = TRUE)
## [1] 4.715
var(Q3a[["ERA"]], na.rm = TRUE)
## [1] 62.11075
sd(Q3a[["ERA"]], na.rm = TRUE)
## [1] 7.881037
IQR(Q3a[["ERA"]], na.rm = TRUE)
## [1] 2.9375
# Q4b: descriptive-statistics table for the CG, SO, BB, SHO and ERA columns
# of the 2019 data, rendered by stargazer as plain text.
select(Q3a, CG, SO, BB, SHO, ERA) %>%
  stargazer(type = "text")
##
## =============================================================
## Statistic N Mean St. Dev. Min Pctl(25) Pctl(75) Max
## -------------------------------------------------------------
## CG 930 0.048 0.272 0 0 0 3
## SO 930 46.046 51.576 0 9 67 326
## BB 930 17.091 16.063 0 5 24 86
## SHO 930 0.028 0.183 0 0 0 2
## ERA 928 6.022 7.881 0.000 3.598 6.535 162.000
## -------------------------------------------------------------