#install.packages("Lahman")
library(Lahman)
library(tidyverse)
data(People)
#head(People)
#summary(People)
?People
Question 1:
Looking only at players from the United States, which state have the most baseball players come from? How many players come from Oregon? Create a new dataframe to accomplish this.
Highest_Player <-People %>%
filter(birthCountry == "USA")%>%
count(birthState,sort =TRUE,name ="playerCount")
#CA has the most players in baseball in this data set
Player_OR <- Highest_Player %>%
filter(birthState == "OR")
# There are 137 baseball players from Oregon.
rbind(Highest_Player[1,],Player_OR)
## birthState playerCount
## 1 CA 2348
## 2 OR 137
Question 2:
Create a new data frame that only includes players less than 60 inches tall. Show the data set. How many players are there who are less than 60 inches tall? What are their names?
na.rm=TRUE
very_small <- People %>%
filter(height < 60)%>%
summarise(nameFirst,nameLast,height)
very_small
## nameFirst nameLast height
## 1 Eddie Gaedel 43
## 2 Tom Healey 55
Looks like Eddie Gaedel and Tom Healey were the shortest in the data set. Only 2? wow.
Question 3:
What percentage of players bat left handed? Right handed? Both? Create a new dataframe to accomplish this.
bat_Hands <- People %>%
na.omit(People)%>%
count(bats,sort = TRUE, name = "pCount")%>%
mutate(bats,percentage = (pCount/sum(pCount)*100))
bat_Hands
## bats pCount percentage
## 1 R 5558 66.931599
## 2 L 2322 27.962428
## 3 B 424 5.105973
Question 4:
Which are taller on average: players who throw with their right hand or players with throw with their left? Create a new dataframe to accomplish this.
taller_Players <- People %>%
na.omit(People)%>%
group_by(throws)%>%
summarise(height_avg = mean(height))
taller_Players
## # A tibble: 2 × 2
## throws height_avg
## <fct> <dbl>
## 1 L 71.4
## 2 R 71.3
Question 5:
Make a dataframe that displays just the average height of players from each country. What country has the tallest players? What country has the shortest players? ● Hint: You can use the View() function to sort variables by a given column
avg_Hight_Tall <- People %>%
na.omit(People)%>%
group_by(birthCountry)%>%
summarise(tallest = mean(height),)%>%
arrange(desc(tallest,.by_group = TRUE))
avg_Hight_Short <- People %>%
na.omit(People)%>%
group_by(birthCountry)%>%
summarise(shortest = mean(height),)%>%
arrange(shortest,.by_group = TRUE)
cbind(avg_Hight_Tall[1,],avg_Hight_Short[1,])
## birthCountry tallest birthCountry shortest
## 1 Japan 76 Australia 67
Question 6:
Add a new column to the data set for body mass index (BMI).
all_BMI <- People%>%
mutate(People,BMI = ((weight*703)/(height^2)))
head(all_BMI)
## playerID birthYear birthMonth birthDay birthCountry birthState birthCity
## 1 aardsda01 1981 12 27 USA CO Denver
## 2 aaronha01 1934 2 5 USA AL Mobile
## 3 aaronto01 1939 8 5 USA AL Mobile
## 4 aasedo01 1954 9 8 USA CA Orange
## 5 abadan01 1972 8 25 USA FL Palm Beach
## 6 abadfe01 1985 12 17 D.R. La Romana La Romana
## deathYear deathMonth deathDay deathCountry deathState deathCity nameFirst
## 1 NA NA NA <NA> <NA> <NA> David
## 2 2021 1 22 USA GA Atlanta Hank
## 3 1984 8 16 USA GA Atlanta Tommie
## 4 NA NA NA <NA> <NA> <NA> Don
## 5 NA NA NA <NA> <NA> <NA> Andy
## 6 NA NA NA <NA> <NA> <NA> Fernando
## nameLast nameGiven weight height bats throws debut finalGame
## 1 Aardsma David Allan 215 75 R R 2004-04-06 2015-08-23
## 2 Aaron Henry Louis 180 72 R R 1954-04-13 1976-10-03
## 3 Aaron Tommie Lee 190 75 R R 1962-04-10 1971-09-26
## 4 Aase Donald William 190 75 R R 1977-07-26 1990-10-03
## 5 Abad Fausto Andres 184 73 L L 2001-09-10 2006-04-13
## 6 Abad Fernando Antonio 235 74 L L 2010-07-28 2019-09-28
## retroID bbrefID deathDate birthDate BMI
## 1 aardd001 aardsda01 <NA> 1981-12-27 26.87022
## 2 aaroh101 aaronha01 2021-01-22 1934-02-05 24.40972
## 3 aarot101 aaronto01 1984-08-16 1939-08-05 23.74578
## 4 aased001 aasedo01 <NA> 1954-09-08 23.74578
## 5 abada001 abadan01 <NA> 1972-08-25 24.27322
## 6 abadf001 abadfe01 <NA> 1985-12-17 30.16892
#Question 7:
The CDC ( defines a BMI greater than 30 “within an obese range”. Create a dataframe of only the obese players. How many players are in this dataset? Which player has the highest BMI?
Obese <-all_BMI %>%
filter(BMI > 30)%>%
arrange(desc(BMI),.by_group =TRUE)%>%
summarise(nameFirst,nameLast,BMI)
head(Obese)
## nameFirst nameLast BMI
## 1 Alejandro Kirk 40.28871
## 2 Bartolo Colon 39.74509
## 3 Pablo Sandoval 38.44980
## 4 Prince Fielder 38.35053
## 5 Jumbo Diaz 38.33882
## 6 Reyes Moronta 38.01939
There are 551 players that are considered obese. The highest BMI player is Alejandro Kirk at 40.28 BMI.
#Question 8:
Create a dataframe that compares the average body mass index (BMI) of players from each state and nation combination.
compare_BMI <- all_BMI %>%
na.omit(People)%>%
group_by(birthCountry, birthState)%>%
summarise(avg_BMI = mean(BMI))
## `summarise()` has grouped output by 'birthCountry'. You can override using the `.groups` argument.
head(compare_BMI)
## # A tibble: 6 × 3
## # Groups: birthCountry [3]
## birthCountry birthState avg_BMI
## <chr> <chr> <dbl>
## 1 Australia Queensland 24.7
## 2 Bahamas New Providence 24.8
## 3 CAN AB 23.8
## 4 CAN BC 25.0
## 5 CAN MB 23.9
## 6 CAN NB 25.5
#Question 9:
Creating a dataset for only baseball players who have deceased, add a new column to the data set for a player’s lifetime (in years). You can assume that each year has 365 days. Which baseball player lived the longest? How many years old was he? ● Note that both birthDate and deathDate are as.Date variables. Subtracting as.Date variables will give the number of days between the dates.
lifeTime <- People %>%
group_by(nameFirst,nameLast,deathYear) %>%
summarise(life_Span_Year = (deathDate- birthDate)/365 ) %>%
arrange(desc(life_Span_Year))
## `summarise()` has grouped output by 'nameFirst', 'nameLast', 'deathYear'. You can override using the `.groups` argument.
head(lifeTime)
## # A tibble: 6 × 4
## # Groups: nameFirst, nameLast, deathYear [6]
## nameFirst nameLast deathYear life_Span_Year
## <chr> <chr> <int> <drtn>
## 1 Red Hoff 1998 107.4329 days
## 2 Connie Marrero 2014 103.0658 days
## 3 Bob Wright 1993 101.6959 days
## 4 Ace Parker 2013 101.5425 days
## 5 Tony Malinosky 2011 101.4082 days
## 6 Karl Swanson 2002 101.3616 days
Red Hoff died in 1998 at 107 years old!
#Question 10:
Convert the debut and finalGame variables to dates using the as.Date() function. Add a new column to the data set for a player’s career (in years). You can assume that each year has 365 days. Which baseball player had the longest career? How long was it?
careerLife<-People%>%
mutate(People,career = (as.Date(finalGame)-as.Date(debut))/365)%>%
arrange(desc(career))%>%
summarise(nameFirst,nameLast,career)
head(careerLife)
## nameFirst nameLast career
## 1 Nick Altrock 35.23836 days
## 2 Jim O'Rourke 32.42740 days
## 3 Minnie Minoso 31.48493 days
## 4 Charley O'Leary 30.48219 days
## 5 Arlie Latham 29.25479 days
## 6 Deacon McGuire 27.92329 days
35 years is the longest a baseball player has played in this database, his name is Nick Altrock