#install.packages("Lahman")
library(Lahman)
library(tidyverse)
data(People)
#head(People)
#summary(People)
?People

Question 1:

Looking only at players from the United States, which state have the most baseball players come from? How many players come from Oregon? Create a new dataframe to accomplish this.

Highest_Player <-People %>%
  filter(birthCountry == "USA")%>%
  count(birthState,sort =TRUE,name ="playerCount")

  
#CA has the most players in baseball in this data set

Player_OR <- Highest_Player %>%
    filter(birthState == "OR")
# There are 137 baseball players from Oregon.


rbind(Highest_Player[1,],Player_OR)
##   birthState playerCount
## 1         CA        2348
## 2         OR         137

Answer 1: Looks like CA has the highest number of players and OR has 137 players.

Question 2:

Create a new data frame that only includes players less than 60 inches tall. Show the data set. How many players are there who are less than 60 inches tall? What are their names?

very_small <- People %>%
  filter(height < 60,na.rm=TRUE)%>%
  summarise(nameFirst,nameLast,height)
very_small
##   nameFirst nameLast height
## 1     Eddie   Gaedel     43
## 2       Tom   Healey     55

Answer 2: Looks like Eddie Gaedel and Tom Healey were the shortest in the data set. Only 2? wow.

Question 3: What percentage of players bat left handed? Right handed? Both? Create a new dataframe to accomplish this.

bat_Hands <- People %>%
  na.omit(People)%>%
  count(bats,sort = TRUE, name = "pCount")%>%
  
  mutate(bats,percentage = (pCount/sum(pCount)*100))
  
bat_Hands
##   bats pCount percentage
## 1    R   5558  66.931599
## 2    L   2322  27.962428
## 3    B    424   5.105973

Answer 3: There are 5558 right handed players which is 67% of total players There are 2322 left handed players which is 28% of total players There are 424 both handed players which is 5% of the total players.

Question 4:

Which are taller on average: players who throw with their right hand or players with throw with their left? Create a new dataframe to accomplish this.

taller_Players <- People %>%
  na.omit(People)%>%
  group_by(throws)%>%
  summarise(height_avg = mean(height))
  
taller_Players
## # A tibble: 2 × 2
##   throws height_avg
##   <fct>       <dbl>
## 1 L            71.4
## 2 R            71.3

Answer 4: The average hieght for left handers is 71.4 inches tall the average height for right handers is 71.2 inches tall

Pretty close adverage if you ask me!

Question 5:

Make a dataframe that displays just the average height of players from each country. What country has the tallest players? What country has the shortest players? ● Hint: You can use the View() function to sort variables by a given column

avg_Hight_Tall <- People %>%
  na.omit(People)%>%
  group_by(birthCountry)%>%
  summarise(tallest = mean(height),)%>%
  arrange(desc(tallest))

avg_Hight_Short <- People %>%
  na.omit(People)%>%
  group_by(birthCountry)%>%
  summarise(shortest = mean(height),)%>%
  arrange(shortest)

cbind(avg_Hight_Tall[1,],avg_Hight_Short[1,])
##   birthCountry tallest birthCountry shortest
## 1        Japan      76    Australia       67

Answer 5: The tallest avgerage height of players is Japan being at 76in The shortest average height of players is Austalia being at 67in

Question 6:

Add a new column to the data set for body mass index (BMI).

all_BMI <- People%>%
  mutate(People,BMI = ((weight*703)/(height^2)))

head(all_BMI)
##    playerID birthYear birthMonth birthDay birthCountry birthState  birthCity
## 1 aardsda01      1981         12       27          USA         CO     Denver
## 2 aaronha01      1934          2        5          USA         AL     Mobile
## 3 aaronto01      1939          8        5          USA         AL     Mobile
## 4  aasedo01      1954          9        8          USA         CA     Orange
## 5  abadan01      1972          8       25          USA         FL Palm Beach
## 6  abadfe01      1985         12       17         D.R.  La Romana  La Romana
##   deathYear deathMonth deathDay deathCountry deathState deathCity nameFirst
## 1        NA         NA       NA         <NA>       <NA>      <NA>     David
## 2      2021          1       22          USA         GA   Atlanta      Hank
## 3      1984          8       16          USA         GA   Atlanta    Tommie
## 4        NA         NA       NA         <NA>       <NA>      <NA>       Don
## 5        NA         NA       NA         <NA>       <NA>      <NA>      Andy
## 6        NA         NA       NA         <NA>       <NA>      <NA>  Fernando
##   nameLast        nameGiven weight height bats throws      debut  finalGame
## 1  Aardsma      David Allan    215     75    R      R 2004-04-06 2015-08-23
## 2    Aaron      Henry Louis    180     72    R      R 1954-04-13 1976-10-03
## 3    Aaron       Tommie Lee    190     75    R      R 1962-04-10 1971-09-26
## 4     Aase   Donald William    190     75    R      R 1977-07-26 1990-10-03
## 5     Abad    Fausto Andres    184     73    L      L 2001-09-10 2006-04-13
## 6     Abad Fernando Antonio    235     74    L      L 2010-07-28 2019-09-28
##    retroID   bbrefID  deathDate  birthDate      BMI
## 1 aardd001 aardsda01       <NA> 1981-12-27 26.87022
## 2 aaroh101 aaronha01 2021-01-22 1934-02-05 24.40972
## 3 aarot101 aaronto01 1984-08-16 1939-08-05 23.74578
## 4 aased001  aasedo01       <NA> 1954-09-08 23.74578
## 5 abada001  abadan01       <NA> 1972-08-25 24.27322
## 6 abadf001  abadfe01       <NA> 1985-12-17 30.16892

Question 7:

The CDC (defines a BMI greater than 30 “within an obese range”. Create a dataframe of only the obese players. How many players are in this dataset? Which player has the highest BMI?

Obese <-all_BMI %>%
  filter(BMI > 30)%>%
  arrange(desc(BMI))%>%
  summarise(nameFirst,nameLast,BMI)

head(Obese)
##   nameFirst nameLast      BMI
## 1 Alejandro     Kirk 40.28871
## 2   Bartolo    Colon 39.74509
## 3     Pablo Sandoval 38.44980
## 4    Prince  Fielder 38.35053
## 5     Jumbo     Diaz 38.33882
## 6     Reyes  Moronta 38.01939

Answer 7: There are 551 players that are considered obese. The highest BMI player is Alejandro Kirk at 40.28 BMI.

Question 8:

Create a dataframe that compares the average body mass index (BMI) of players from each state and nation combination.

compare_BMI <- all_BMI %>%
  na.omit(People)%>%
  group_by(birthCountry, birthState)%>%
  summarise(avg_BMI = mean(BMI))
## `summarise()` has grouped output by 'birthCountry'. You can override using the `.groups` argument.
head(compare_BMI)
## # A tibble: 6 × 3
## # Groups:   birthCountry [3]
##   birthCountry birthState     avg_BMI
##   <chr>        <chr>            <dbl>
## 1 Australia    Queensland        24.7
## 2 Bahamas      New Providence    24.8
## 3 CAN          AB                23.8
## 4 CAN          BC                25.0
## 5 CAN          MB                23.9
## 6 CAN          NB                25.5

Question 9:

Creating a dataset for only baseball players who have deceased, add a new column to the data set for a player’s lifetime (in years). You can assume that each year has 365 days. Which baseball player lived the longest? How many years old was he? ● Note that both birthDate and deathDate are as.Date variables. Subtracting as.Date variables will give the number of days between the dates.

lifeTime <- People %>%
  group_by(nameFirst,nameLast, deathYear) %>%
  summarise(life_Span_Year = (deathDate- birthDate)/365 ) %>%
  arrange(desc(life_Span_Year)) 
## `summarise()` has grouped output by 'nameFirst', 'nameLast', 'deathYear'. You can override using the `.groups` argument.
lifeTime
## # A tibble: 20,093 × 4
## # Groups:   nameFirst, nameLast, deathYear [19,876]
##    nameFirst nameLast  deathYear life_Span_Year
##    <chr>     <chr>         <int> <drtn>        
##  1 Red       Hoff           1998 107.4329 days 
##  2 Connie    Marrero        2014 103.0658 days 
##  3 Bob       Wright         1993 101.6959 days 
##  4 Ace       Parker         2013 101.5425 days 
##  5 Tony      Malinosky      2011 101.4082 days 
##  6 Karl      Swanson        2002 101.3616 days 
##  7 John      Daley          1988 101.3370 days 
##  8 Bill      Otis           1990 101.0411 days 
##  9 Rollie    Stiles         2007 100.7452 days 
## 10 Billy     Werber         2009 100.6603 days 
## # … with 20,083 more rows

Answer 9: Red Hoff died in 1998 at 107 years old!

Question 10:

Convert the debut and finalGame variables to dates using the as.Date() function. Add a new column to the data set for a player’s career (in years). You can assume that each year has 365 days. Which baseball player had the longest career? How long was it?

 careerLife<-People%>%
  mutate(People,career = (as.Date(finalGame)-as.Date(debut))/365)%>%
  arrange(desc(career))%>%
  summarise(nameFirst,nameLast,career)
head(careerLife)
##   nameFirst nameLast        career
## 1      Nick  Altrock 35.23836 days
## 2       Jim O'Rourke 32.42740 days
## 3    Minnie   Minoso 31.48493 days
## 4   Charley  O'Leary 30.48219 days
## 5     Arlie   Latham 29.25479 days
## 6    Deacon  McGuire 27.92329 days

Answer 10: 35 years is the longest a baseball player has played in this database, his name is Nick Altrock