Q1
Primary Keys are playerID, yearID, stint, and teamID,
Fielding returns an nrow of 24572 whilst Pitching returns an nrow of
0
Fielding %>% count(playerID, yearID, stint, teamID) %>% filter(n>1) %>% nrow()
## [1] 24572
Pitching %>% count(playerID, yearID, stint, teamID) %>% filter(n>1) %>% nrow()
## [1] 0
Q2
18095
See Output down below!
People %>% filter(birthCountry == "USA", birthState %in% c(state.abb, "DC")) %>% nrow()
## [1] 18095
People %>% count(birthState) %>% slice_max(n,n=5)
## birthState n
## 1 CA 2474
## 2 PA 1471
## 3 NY 1293
## 4 IL 1114
## 5 OH 1085
Q3
83 born 2000 or later and yes all of them having fielding data. 16
have opposite Batting and throwing hands
check <- People %>% filter(birthYear >= 2000)
check %>% nrow() %>% cat()
## 83
fielding <- check %>% inner_join(Fielding, by = "playerID") %>% distinct(playerID)
fielding %>% nrow() %>% cat()
## 83
hands <- check %>% filter(((bats == "L" & throws == "R") | (bats == "R" & throws == "L")))
hands %>% nrow() %>% cat()
## 16
Q4
A) 55
B) 4
C) 27.14545, 143.2857
check <- People %>% filter(birthYear >= 2000)
Fielding %>% inner_join(check, by = "playerID") %>% group_by(playerID) %>% summarise(y = n_distinct(yearID)) %>% filter(y == 1) %>% nrow() %>% cat()
## 55
Fielding %>% inner_join(check, by = "playerID") %>% group_by(playerID) %>% summarise(y = n_distinct(yearID)) %>% slice_max(y)
## # A tibble: 1 × 2
## playerID y
## <chr> <int>
## 1 garcilu04 4
sum <- Fielding %>% inner_join(check, by = "playerID") %>% group_by(playerID) %>% summarise(tg = sum(G), stats = n_distinct(yearID)) %>% ungroup()
sum %>% filter(stats == 1) %>% summarise(y = mean(tg)) %>% pull(y) %>% cat()
## 27.14545
sum %>% filter(stats > 1) %>% summarise(y = mean(tg)) %>% pull(y) %>% cat()
## 143.2857
Q5
A) 10569
B) Pitcher first at 42628
C) 59, 54, 49, 41, 35
D) 617. 333, 6
E)
yearID teamID stint Batting Pitching
1 2018 LAA 1 22 6
2 2019 LAA 1 18 0
3 2020 LAA 1 7 0
4 2021 LAA 1 46 15
5 2022 LAA 1 34 14
6 2023 LAA 1 44 18
cond <- inner_join(Batting, Pitching, by = c("playerID","yearID","teamID","stint"), suffix = c("_bat","_pitch"))
nrow(cond)
## [1] 51368
cond <- cond %>% mutate(type = case_when( HR_pitch > HR_bat ~ "pitcher first", HR_bat > HR_pitch ~ "two way", TRUE ~ "equal" ))
cond %>% count(type)
## type n
## 1 equal 7856
## 2 pitcher first 42628
## 3 two way 884
cond %>% filter(HR_bat > HR_pitch) %>% mutate(diff = HR_bat - HR_pitch) %>% arrange(desc(diff)) %>% slice_head(n = 5) %>% select(HR_bat)
## HR_bat
## 1 59
## 2 54
## 3 49
## 4 41
## 5 35
tway <- cond %>% filter(type == "two way") %>% distinct(playerID) %>% left_join(People, by = "playerID")
nrow(tway)
## [1] 617
sum(is.na(tway$deathYear))
## [1] 333
Uncommon = suppressWarnings(invisible(People %>% filter(playerID %in% (inner_join(Batting, Pitching, by = "playerID", relationship = "many-to-many") %>% distinct(playerID))$playerID))) %>% filter(!is.na(birthMonth)) %>% count(birthMonth, sort = TRUE) %>% slice_tail(n = 1) %>% pull(birthMonth) %>% print()
## [1] 6
Batting %>% filter(playerID == "ohtansh01") %>% select(yearID, teamID, stint, Batting = HR) %>% left_join(Pitching %>% filter(playerID == "ohtansh01") %>% select(yearID, teamID, stint, Pitching = HR), by = c("yearID", "teamID", "stint")) %>% replace_na(list(Pitching = 0)) %>% arrange(yearID)
## yearID teamID stint Batting Pitching
## 1 2018 LAA 1 22 6
## 2 2019 LAA 1 18 0
## 3 2020 LAA 1 7 0
## 4 2021 LAA 1 46 15
## 5 2022 LAA 1 34 14
## 6 2023 LAA 1 44 18
Part 2
return <- expand.grid(origin = c("EWR","JFK","LGA"), date = seq(as.Date("2013-01-01"), as.Date("2013-12-31"), "day"), hour = 0:23)
return %>% anti_join(weather %>% mutate(date = as.Date(time_hour)) %>% select(origin, date, hour), by = c("origin","date","hour")) %>% count(date, origin) %>% pivot_wider(names_from = origin, values_from = n) %>% arrange(date) %>% print(n = Inf)
## # A tibble: 25 × 4
## date EWR JFK LGA
## <date> <int> <int> <int>
## 1 2013-01-01 7 7 6
## 2 2013-01-06 NA NA 1
## 3 2013-02-18 1 NA NA
## 4 2013-02-20 1 NA NA
## 5 2013-02-21 1 1 1
## 6 2013-02-23 NA NA 1
## 7 2013-03-05 NA 1 1
## 8 2013-03-10 1 1 1
## 9 2013-04-03 NA 1 NA
## 10 2013-07-02 2 NA NA
## 11 2013-07-31 1 NA 1
## 12 2013-08-13 NA 1 1
## 13 2013-08-16 NA 1 1
## 14 2013-08-19 1 1 1
## 15 2013-08-22 1 1 NA
## 16 2013-08-23 2 2 2
## 17 2013-09-02 1 NA NA
## 18 2013-10-23 2 NA NA
## 19 2013-10-26 5 5 5
## 20 2013-10-27 1 1 1
## 21 2013-11-01 NA 2 2
## 22 2013-11-03 6 6 6
## 23 2013-11-04 1 1 1
## 24 2013-12-17 1 NA NA
## 25 2013-12-31 24 24 24