Q1

Primary Keys are playerID, yearID, stint, and teamID,

Fielding returns an nrow of 24572 whilst Pitching returns an nrow of 0

Fielding %>% count(playerID, yearID, stint, teamID) %>% filter(n>1) %>% nrow()
## [1] 24572
Pitching %>% count(playerID, yearID, stint, teamID) %>% filter(n>1) %>% nrow()
## [1] 0

Q2

18095

See Output down below!

People %>% filter(birthCountry == "USA", birthState %in% c(state.abb, "DC")) %>% nrow()
## [1] 18095
People %>% count(birthState) %>% slice_max(n,n=5)
##   birthState    n
## 1         CA 2474
## 2         PA 1471
## 3         NY 1293
## 4         IL 1114
## 5         OH 1085

Q3

83 born 2000 or later and yes all of them having fielding data. 16 have opposite Batting and throwing hands

check <- People %>% filter(birthYear >= 2000) 
check %>% nrow() %>% cat()
## 83
fielding <- check %>% inner_join(Fielding, by = "playerID") %>% distinct(playerID)
fielding %>% nrow() %>% cat() 
## 83
hands <- check %>% filter(((bats == "L" & throws == "R") | (bats == "R" & throws == "L"))) 
hands %>% nrow() %>% cat()
## 16

Q4

A) 55

B) 4

C) 27.14545, 143.2857

check <- People %>% filter(birthYear >= 2000) 

Fielding %>% inner_join(check, by = "playerID") %>% group_by(playerID) %>% summarise(y = n_distinct(yearID)) %>% filter(y == 1) %>% nrow() %>% cat()
## 55
Fielding %>% inner_join(check, by = "playerID") %>% group_by(playerID) %>% summarise(y = n_distinct(yearID)) %>%  slice_max(y) 
## # A tibble: 1 × 2
##   playerID      y
##   <chr>     <int>
## 1 garcilu04     4
sum <- Fielding %>% inner_join(check, by = "playerID") %>% group_by(playerID) %>% summarise(tg = sum(G), stats = n_distinct(yearID)) %>% ungroup()

sum %>% filter(stats == 1) %>% summarise(y = mean(tg)) %>% pull(y) %>% cat()
## 27.14545
sum %>% filter(stats > 1) %>% summarise(y = mean(tg)) %>% pull(y) %>% cat()
## 143.2857

Q5

A) 10569

B) Pitcher first at 42628

C) 59, 54, 49, 41, 35

D) 617. 333, 6

E)

yearID teamID stint Batting Pitching

1 2018 LAA 1 22 6

2 2019 LAA 1 18 0

3 2020 LAA 1 7 0

4 2021 LAA 1 46 15

5 2022 LAA 1 34 14

6 2023 LAA 1 44 18

cond <- inner_join(Batting, Pitching, by = c("playerID","yearID","teamID","stint"), suffix = c("_bat","_pitch")) 
nrow(cond)
## [1] 51368
cond <- cond %>% mutate(type = case_when( HR_pitch > HR_bat ~ "pitcher first", HR_bat > HR_pitch ~ "two way", TRUE ~ "equal" ))
cond %>% count(type)
##            type     n
## 1         equal  7856
## 2 pitcher first 42628
## 3       two way   884
cond %>% filter(HR_bat > HR_pitch) %>% mutate(diff = HR_bat - HR_pitch) %>% arrange(desc(diff)) %>% slice_head(n = 5) %>% select(HR_bat)
##   HR_bat
## 1     59
## 2     54
## 3     49
## 4     41
## 5     35
tway <- cond %>% filter(type == "two way") %>% distinct(playerID) %>% left_join(People, by = "playerID")

nrow(tway) 
## [1] 617
sum(is.na(tway$deathYear))               
## [1] 333
Uncommon = suppressWarnings(invisible(People %>% filter(playerID %in% (inner_join(Batting, Pitching, by = "playerID", relationship = "many-to-many") %>% distinct(playerID))$playerID))) %>% filter(!is.na(birthMonth)) %>% count(birthMonth, sort = TRUE) %>% slice_tail(n = 1) %>% pull(birthMonth) %>% print()
## [1] 6
Batting %>%  filter(playerID == "ohtansh01") %>% select(yearID, teamID, stint, Batting = HR) %>% left_join(Pitching %>% filter(playerID == "ohtansh01") %>% select(yearID, teamID, stint, Pitching = HR), by = c("yearID", "teamID", "stint")) %>% replace_na(list(Pitching = 0)) %>% arrange(yearID)
##   yearID teamID stint Batting Pitching
## 1   2018    LAA     1      22        6
## 2   2019    LAA     1      18        0
## 3   2020    LAA     1       7        0
## 4   2021    LAA     1      46       15
## 5   2022    LAA     1      34       14
## 6   2023    LAA     1      44       18

Part 2

return <- expand.grid(origin = c("EWR","JFK","LGA"), date = seq(as.Date("2013-01-01"), as.Date("2013-12-31"), "day"), hour = 0:23)

return %>% anti_join(weather %>% mutate(date = as.Date(time_hour)) %>% select(origin, date, hour), by = c("origin","date","hour")) %>% count(date, origin) %>% pivot_wider(names_from = origin, values_from = n) %>% arrange(date) %>% print(n = Inf)
## # A tibble: 25 × 4
##    date         EWR   JFK   LGA
##    <date>     <int> <int> <int>
##  1 2013-01-01     7     7     6
##  2 2013-01-06    NA    NA     1
##  3 2013-02-18     1    NA    NA
##  4 2013-02-20     1    NA    NA
##  5 2013-02-21     1     1     1
##  6 2013-02-23    NA    NA     1
##  7 2013-03-05    NA     1     1
##  8 2013-03-10     1     1     1
##  9 2013-04-03    NA     1    NA
## 10 2013-07-02     2    NA    NA
## 11 2013-07-31     1    NA     1
## 12 2013-08-13    NA     1     1
## 13 2013-08-16    NA     1     1
## 14 2013-08-19     1     1     1
## 15 2013-08-22     1     1    NA
## 16 2013-08-23     2     2     2
## 17 2013-09-02     1    NA    NA
## 18 2013-10-23     2    NA    NA
## 19 2013-10-26     5     5     5
## 20 2013-10-27     1     1     1
## 21 2013-11-01    NA     2     2
## 22 2013-11-03     6     6     6
## 23 2013-11-04     1     1     1
## 24 2013-12-17     1    NA    NA
## 25 2013-12-31    24    24    24