# Package setup: install each package only when it is missing, then attach it.
# requireNamespace() only checks whether the package can be loaded; unlike
# require() it does not attach it and, with quietly = TRUE, stays silent when
# the package is absent. Attaching is left to the explicit library() call,
# which stops with an error if the installation did not succeed.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
if (!requireNamespace("Lahman", quietly = TRUE)) {
  install.packages("Lahman")
}
library(Lahman)
if (!requireNamespace("stargazer", quietly = TRUE)) {
  install.packages("stargazer")
}
library(stargazer)
if (!requireNamespace("magrittr", quietly = TRUE)) {
  install.packages("magrittr")
}
library(magrittr)
if (!requireNamespace("rmarkdown", quietly = TRUE)) {
  install.packages("rmarkdown")
}
library(rmarkdown)
# Q1: the first six rows of Pitching (six is head()'s default, spelled out).
Q1 <- head(Pitching, n = 6L)
Q1
##    playerID yearID stint teamID lgID  W  L  G GS CG SHO SV IPouts   H  ER HR BB
## 1 bechtge01   1871     1    PH1   NA  1  2  3  3  2   0  0     78  43  23  0 11
## 2 brainas01   1871     1    WS3   NA 12 15 30 30 30   0  0    792 361 132  4 37
## 3 fergubo01   1871     1    NY2   NA  0  0  1  0  0   0  0      3   8   3  0  0
## 4 fishech01   1871     1    RC1   NA  4 16 24 24 22   1  0    639 295 103  3 31
## 5 fleetfr01   1871     1    NY2   NA  0  1  1  1  1   0  0     27  20  10  0  3
## 6 flowedi01   1871     1    TRO   NA  0  0  1  0  0   0  0      3   1   0  0  0
##   SO BAOpp   ERA IBB WP HBP BK  BFP GF   R SH SF GIDP
## 1  1    NA  7.96  NA  7  NA  0  146  0  42 NA NA   NA
## 2 13    NA  4.50  NA  7  NA  0 1291  0 292 NA NA   NA
## 3  0    NA 27.00  NA  2  NA  0   14  0   9 NA NA   NA
## 4 15    NA  4.35  NA 20  NA  0 1080  1 257 NA NA   NA
## 5  0    NA 10.00  NA  0  NA  0   57  0  21 NA NA   NA
## 6  0    NA  0.00  NA  0  NA  0    3  1   0 NA NA   NA
# Q2a: rows of Pitching with SHO of 12 or more.
# paged_table() (from the rmarkdown package) wraps a data frame so that it is
# shown as a pageable table in HTML output. It is for presentation only: you
# don't need it in your assignment and can ignore it in the answers. The same
# applies to every later use of paged_table() in this script.
Q2a <- Pitching %>%
  filter(SHO >= 12) %>%
  paged_table()
Q2a

# Q2b: of the Q2a rows, keep those with ERA of 2 or lower.
# The same rows in a single step (without the paged_table() wrapper):
#   Q2b <- filter(Pitching, SHO >= 12 & ERA <= 2)
Q2b <- Q2a %>%
  filter(ERA <= 2) %>%
  paged_table()
Q2b
# Q3a: rows of Pitching where yearID is 2019.
Q3a <- Pitching %>%
  filter(yearID == 2019) %>%
  paged_table()
Q3a

# Q3b: keep only the playerID, teamID, SO, BB and ERA columns.
Q3b <- Q3a %>%
  select(playerID, teamID, SO, BB, ERA) %>%
  paged_table()
Q3b

# Q3c: add SOBB, the ratio SO / BB.
# Rows with BB == 0 come out as Inf (or NaN when SO is 0 as well).
Q3c <- Q3b %>%
  mutate(SOBB = SO / BB) %>%
  paged_table()
Q3c

# Q3d: rows with teamID "CIN" only, sorted from highest to lowest SOBB.
# Alternative that keeps every team and sorts within each team instead:
#   Q3d <- Q3c %>% arrange(teamID, desc(SOBB))
Q3d <- Q3c %>%
  filter(teamID == "CIN") %>%
  arrange(desc(SOBB)) %>%
  paged_table()
Q3d
# Q4a: summary statistics of ERA in the 2019 data.
mean(Q3a[["ERA"]], na.rm = TRUE)
## [1] 6.02181
# Q3a holds the 2019 rows of Pitching (built in question 3a).
# ERA has missing values in the 2019 data, so na.rm = TRUE drops those NA
# values before each statistic is computed.
median(Q3a[["ERA"]], na.rm = TRUE)
## [1] 4.715
var(Q3a[["ERA"]], na.rm = TRUE)
## [1] 62.11075
sd(Q3a[["ERA"]], na.rm = TRUE)
## [1] 7.881037
IQR(Q3a[["ERA"]], na.rm = TRUE)
## [1] 2.9375
# Q4b: descriptive-statistics table for CG, SO, BB, SHO and ERA, printed as
# plain text by stargazer.
Q3a %>%
  select(CG, SO, BB, SHO, ERA) %>%
  stargazer(type = "text")
## 
## =============================================================
## Statistic  N   Mean  St. Dev.  Min  Pctl(25) Pctl(75)   Max  
## -------------------------------------------------------------
## CG        930 0.048   0.272     0      0        0        3   
## SO        930 46.046  51.576    0      9        67      326  
## BB        930 17.091  16.063    0      5        24      86   
## SHO       930 0.028   0.183     0      0        0        2   
## ERA       928 6.022   7.881   0.000  3.598    6.535   162.000
## -------------------------------------------------------------