# Setup: load the packages used throughout this report and the People table.
# One-time install step, kept commented out so it does not run on every pass.
#install.packages("Lahman")
library(Lahman)
library(tidyverse)
# Load the People data set into the workspace; every question below reads it.
data(People)
# Optional quick looks at the data, left disabled.
#head(People)
#summary(People)
# Opens the help page for People (interactive use only).
?People

Question 1:

Looking only at players from the United States, which state have the most baseball players come from? How many players come from Oregon? Create a new dataframe to accomplish this.

# Tally US-born players by birth state, ordered from most to fewest players.
Highest_Player <- count(
  filter(People, birthCountry == "USA"),
  birthState,
  sort = TRUE,
  name = "playerCount"
)

# The first row of the tally is the state with the most players (CA).

# Pull Oregon's row out of the tally (137 players).
Player_OR <- filter(Highest_Player, birthState == "OR")

# Show the top state and Oregon side by side.
rbind(Highest_Player[1, ], Player_OR)
##   birthState playerCount
## 1         CA        2348
## 2         OR         137

Question 2:

Create a new data frame that only includes players less than 60 inches tall. Show the data set. How many players are there who are less than 60 inches tall? What are their names?

# Players shorter than 60 inches, keeping only their names and heights.
# filter() already drops rows where the condition evaluates to NA (players
# with no recorded height), so no separate NA handling is needed here.
# select() picks the columns; summarise() is meant to return one row per group
# and should not be used as a column selector.
very_small <- People %>%
  filter(height < 60) %>%
  select(nameFirst, nameLast, height)
very_small
##   nameFirst nameLast height
## 1     Eddie   Gaedel     43
## 2       Tom   Healey     55

Looks like Eddie Gaedel and Tom Healey were the shortest in the data set. Only 2? wow.

Question 3:

What percentage of players bat left handed? Right handed? Both? Create a new dataframe to accomplish this.

# Share of players by batting hand (right, left, both), as a percentage.
# Only rows with a missing `bats` value are excluded. Calling na.omit() on the
# whole table would also discard every player with an NA in any other column
# (for example deathYear for living players) and bias the percentages.
# NOTE: the output recorded below was produced before this correction; re-run
# the chunk to refresh it.
bat_Hands <- People %>%
  filter(!is.na(bats)) %>%
  count(bats, sort = TRUE, name = "pCount") %>%
  mutate(percentage = pCount / sum(pCount) * 100)

bat_Hands
##   bats pCount percentage
## 1    R   5558  66.931599
## 2    L   2322  27.962428
## 3    B    424   5.105973

Question 4:

Which are taller on average: players who throw with their right hand or players who throw with their left? Create a new dataframe to accomplish this.

# Average height for each throwing hand.
# Only rows missing `throws` or `height` are dropped; na.omit() on the whole
# table would also remove every player with an NA in an unrelated column
# (for example deathYear for living players) and distort the averages.
# NOTE: the output recorded below was produced before this correction; re-run
# the chunk to refresh it.
taller_Players <- People %>%
  filter(!is.na(throws), !is.na(height)) %>%
  group_by(throws) %>%
  summarise(height_avg = mean(height))

taller_Players
## # A tibble: 2 × 2
##   throws height_avg
##   <fct>       <dbl>
## 1 L            71.4
## 2 R            71.3

Question 5:

Make a dataframe that displays just the average height of players from each country. What country has the tallest players? What country has the shortest players? ● Hint: You can use the View() function to sort variables by a given column

# Average player height per birth country, tallest country first.
# Only rows missing `birthCountry` or `height` are dropped; na.omit() on the
# whole table would also remove every player with an NA in an unrelated column
# (for example deathYear for living players).
# summarise() over a single grouping column returns ungrouped data, so
# arrange() needs no `.by_group` argument (and it never belonged inside desc()).
# NOTE: the output recorded below was produced before this correction; re-run
# the chunk to refresh it.
avg_Hight_Tall <- People %>%
  filter(!is.na(birthCountry), !is.na(height)) %>%
  group_by(birthCountry) %>%
  summarise(tallest = mean(height)) %>%
  arrange(desc(tallest))

# Same averages, shortest country first, reusing the table computed above.
avg_Hight_Short <- avg_Hight_Tall %>%
  rename(shortest = tallest) %>%
  arrange(shortest)

# Tallest and shortest countries side by side.
cbind(avg_Hight_Tall[1, ], avg_Hight_Short[1, ])
##   birthCountry tallest birthCountry shortest
## 1        Japan      76    Australia       67

Question 6:

Add a new column to the data set for body mass index (BMI).

# Add a BMI column using the imperial formula 703 * weight / height^2.
# Assumes weight is recorded in pounds and height in inches -- confirm against
# the People help page.
# The pipe already supplies People as the data argument, so it must not be
# passed again inside mutate().
all_BMI <- People %>%
  mutate(BMI = (weight * 703) / (height^2))

head(all_BMI)
##    playerID birthYear birthMonth birthDay birthCountry birthState  birthCity
## 1 aardsda01      1981         12       27          USA         CO     Denver
## 2 aaronha01      1934          2        5          USA         AL     Mobile
## 3 aaronto01      1939          8        5          USA         AL     Mobile
## 4  aasedo01      1954          9        8          USA         CA     Orange
## 5  abadan01      1972          8       25          USA         FL Palm Beach
## 6  abadfe01      1985         12       17         D.R.  La Romana  La Romana
##   deathYear deathMonth deathDay deathCountry deathState deathCity nameFirst
## 1        NA         NA       NA         <NA>       <NA>      <NA>     David
## 2      2021          1       22          USA         GA   Atlanta      Hank
## 3      1984          8       16          USA         GA   Atlanta    Tommie
## 4        NA         NA       NA         <NA>       <NA>      <NA>       Don
## 5        NA         NA       NA         <NA>       <NA>      <NA>      Andy
## 6        NA         NA       NA         <NA>       <NA>      <NA>  Fernando
##   nameLast        nameGiven weight height bats throws      debut  finalGame
## 1  Aardsma      David Allan    215     75    R      R 2004-04-06 2015-08-23
## 2    Aaron      Henry Louis    180     72    R      R 1954-04-13 1976-10-03
## 3    Aaron       Tommie Lee    190     75    R      R 1962-04-10 1971-09-26
## 4     Aase   Donald William    190     75    R      R 1977-07-26 1990-10-03
## 5     Abad    Fausto Andres    184     73    L      L 2001-09-10 2006-04-13
## 6     Abad Fernando Antonio    235     74    L      L 2010-07-28 2019-09-28
##    retroID   bbrefID  deathDate  birthDate      BMI
## 1 aardd001 aardsda01       <NA> 1981-12-27 26.87022
## 2 aaroh101 aaronha01 2021-01-22 1934-02-05 24.40972
## 3 aarot101 aaronto01 1984-08-16 1939-08-05 23.74578
## 4 aased001  aasedo01       <NA> 1954-09-08 23.74578
## 5 abada001  abadan01       <NA> 1972-08-25 24.27322
## 6 abadf001  abadfe01       <NA> 1985-12-17 30.16892

#Question 7:

The CDC defines a BMI greater than 30 as “within an obese range”. Create a dataframe of only the obese players. How many players are in this dataset? Which player has the highest BMI?

# Players whose BMI exceeds 30, highest BMI first, keeping names and BMI only.
# Rows with a missing BMI are dropped by filter() because the comparison is NA.
# select() picks the columns; summarise() is meant to return one row per group
# and should not be used as a column selector.
Obese <- all_BMI %>%
  filter(BMI > 30) %>%
  arrange(desc(BMI)) %>%
  select(nameFirst, nameLast, BMI)

# Number of players in the obese range.
nrow(Obese)

head(Obese)
##   nameFirst nameLast      BMI
## 1 Alejandro     Kirk 40.28871
## 2   Bartolo    Colon 39.74509
## 3     Pablo Sandoval 38.44980
## 4    Prince  Fielder 38.35053
## 5     Jumbo     Diaz 38.33882
## 6     Reyes  Moronta 38.01939

There are 551 players that are considered obese. The highest BMI player is Alejandro Kirk at 40.28 BMI.

#Question 8:

Create a dataframe that compares the average body mass index (BMI) of players from each state and nation combination.

# Average BMI for every birth country / birth state combination.
# Only rows missing the grouping columns or BMI are dropped; na.omit() on the
# whole table would also remove every player with an NA in an unrelated column
# (for example deathYear for living players).
# `.groups = "drop"` returns an ungrouped result, so later steps are not
# silently computed per country.
# NOTE: the output recorded below was produced before this correction; re-run
# the chunk to refresh it.
compare_BMI <- all_BMI %>%
  filter(!is.na(birthCountry), !is.na(birthState), !is.na(BMI)) %>%
  group_by(birthCountry, birthState) %>%
  summarise(avg_BMI = mean(BMI), .groups = "drop")
head(compare_BMI)
## # A tibble: 6 × 3
## # Groups:   birthCountry [3]
##   birthCountry birthState     avg_BMI
##   <chr>        <chr>            <dbl>
## 1 Australia    Queensland        24.7
## 2 Bahamas      New Providence    24.8
## 3 CAN          AB                23.8
## 4 CAN          BC                25.0
## 5 CAN          MB                23.9
## 6 CAN          NB                25.5

#Question 9:

Creating a dataset for only baseball players who are deceased, add a new column to the data set for a player’s lifetime (in years). You can assume that each year has 365 days. Which baseball player lived the longest? How many years old was he? ● Note that both birthDate and deathDate are as.Date variables. Subtracting as.Date variables will give the number of days between the dates.

# Lifespan in years (365-day years) for deceased players, longest-lived first.
# Rows without both a birth date and a death date are removed so the table
# contains deceased players only.
# as.numeric(..., units = "days") turns the date difference into a plain
# number; otherwise the value keeps a "days" label even after dividing by 365.
# NOTE: the output recorded below was produced before this correction; re-run
# the chunk to refresh it.
lifeTime <- People %>%
  filter(!is.na(birthDate), !is.na(deathDate)) %>%
  mutate(
    life_Span_Year = as.numeric(deathDate - birthDate, units = "days") / 365
  ) %>%
  select(nameFirst, nameLast, deathYear, life_Span_Year) %>%
  arrange(desc(life_Span_Year))
head(lifeTime)
## # A tibble: 6 × 4
## # Groups:   nameFirst, nameLast, deathYear [6]
##   nameFirst nameLast  deathYear life_Span_Year
##   <chr>     <chr>         <int> <drtn>        
## 1 Red       Hoff           1998 107.4329 days 
## 2 Connie    Marrero        2014 103.0658 days 
## 3 Bob       Wright         1993 101.6959 days 
## 4 Ace       Parker         2013 101.5425 days 
## 5 Tony      Malinosky      2011 101.4082 days 
## 6 Karl      Swanson        2002 101.3616 days

Red Hoff died in 1998 at 107 years old!

#Question 10:

Convert the debut and finalGame variables to dates using the as.Date() function. Add a new column to the data set for a player’s career (in years). You can assume that each year has 365 days. Which baseball player had the longest career? How long was it?

 # Career length in years (365-day years) from debut to final game, longest
 # first, keeping only the player names and the career length.
 # as.numeric(..., units = "days") turns the date difference into a plain
 # number; otherwise the value keeps a "days" label even after dividing by 365.
 # The pipe already supplies People, so it is not passed again inside mutate().
 # NOTE: the output recorded below was produced before this correction; re-run
 # the chunk to refresh it.
 careerLife <- People %>%
   mutate(
     career = as.numeric(
       as.Date(finalGame) - as.Date(debut),
       units = "days"
     ) / 365
   ) %>%
   arrange(desc(career)) %>%
   select(nameFirst, nameLast, career)
head(careerLife)
##   nameFirst nameLast        career
## 1      Nick  Altrock 35.23836 days
## 2       Jim O'Rourke 32.42740 days
## 3    Minnie   Minoso 31.48493 days
## 4   Charley  O'Leary 30.48219 days
## 5     Arlie   Latham 29.25479 days
## 6    Deacon  McGuire 27.92329 days

The longest career in this database is about 35 years; it belongs to Nick Altrock.