# One-time setup: uncomment to install the Lahman package if it is missing.
#install.packages("Lahman")
# Lahman provides the People data set analysed below; tidyverse provides the
# %>% pipe and the dplyr verbs (filter, count, mutate, summarise, ...).
library(Lahman)
library(tidyverse)
# Load the People data set into the workspace.
data(People)
# Optional quick looks at the data, left disabled.
#head(People)
#summary(People)
# Open the help page describing the People data set and its columns.
?People
Question 1:
Looking only at players from the United States, which state do the most baseball players come from? How many players come from Oregon? Create a new dataframe to accomplish this.
# Number of US-born players per birth state, largest count first.
Highest_Player <- count(
  filter(People, birthCountry == "USA"),
  birthState,
  sort = TRUE,
  name = "playerCount"
)
# The first row of Highest_Player is the state with the most players (CA).

# Pull out the row for Oregon.
Player_OR <- filter(Highest_Player, birthState == "OR")
# There are 137 baseball players from Oregon.

# Show the top state and Oregon side by side.
rbind(Highest_Player[1, ], Player_OR)
## birthState playerCount
## 1 CA 2348
## 2 OR 137
Answer 1: Looks like CA has the highest number of players and OR has 137 players.
Question 2:
Create a new data frame that only includes players less than 60 inches tall. Show the data set. How many players are there who are less than 60 inches tall? What are their names?
# Players shorter than 60 inches, keeping only their names and heights.
# filter() already drops rows where the condition is NA (missing height), and
# it has no `na.rm` argument: the former `na.rm = TRUE` was silently treated
# as an extra, always-true filter condition, so it has been removed.
# select() picks the columns; summarise() is meant to return one row per group
# and returning several rows from it is deprecated in current dplyr.
very_small <- People %>%
  filter(height < 60) %>%
  select(nameFirst, nameLast, height)
very_small
## nameFirst nameLast height
## 1 Eddie Gaedel 43
## 2 Tom Healey 55
Answer 2: Looks like Eddie Gaedel and Tom Healey were the shortest in the data set. Only 2? wow.
Question 3: What percentage of players bat left handed? Right handed? Both? Create a new dataframe to accomplish this.
# Share of players by batting hand (R = right, L = left, B = both).
# Only rows with a missing `bats` value are excluded. The previous na.omit()
# removed every row with an NA in *any* column, which discards all living
# players (their death fields are NA) and skews the percentages.
# NOTE: the recorded output below predates this fix, so counts and
# percentages will differ when the script is re-run.
bat_Hands <- People %>%
  filter(!is.na(bats)) %>%
  count(bats, sort = TRUE, name = "pCount") %>%
  mutate(percentage = pCount / sum(pCount) * 100)
bat_Hands
## bats pCount percentage
## 1 R 5558 66.931599
## 2 L 2322 27.962428
## 3 B 424 5.105973
Answer 3: There are 5558 right-handed batters, which is 67% of the players counted. There are 2322 left-handed batters, which is 28% of the players counted. There are 424 switch hitters (both), which is 5% of the players counted.
Question 4:
Which are taller on average: players who throw with their right hand or players who throw with their left? Create a new dataframe to accomplish this.
# Average height (inches) by throwing hand.
# Rows are dropped only when `throws` or `height` is missing. The previous
# na.omit() removed every row with an NA in *any* column, which excludes all
# living players (their death fields are NA).
# NOTE: the recorded output below predates this fix, so the averages may
# differ when the script is re-run.
taller_Players <- People %>%
  filter(!is.na(throws), !is.na(height)) %>%
  group_by(throws) %>%
  summarise(height_avg = mean(height))
taller_Players
## # A tibble: 2 × 2
## throws height_avg
## <fct> <dbl>
## 1 L 71.4
## 2 R 71.3
Answer 4: The average height for left-handed throwers is 71.4 inches; the average height for right-handed throwers is 71.3 inches.
Pretty close averages, if you ask me!
Question 5:
Make a dataframe that displays just the average height of players from each country. What country has the tallest players? What country has the shortest players? ● Hint: You can use the View() function to sort variables by a given column
# Average player height (inches) per birth country, tallest first.
# Rows are dropped only when `birthCountry` or `height` is missing. The
# previous na.omit() removed every row with an NA in *any* column, which
# excludes all living players (their death fields are NA).
# NOTE: the recorded output below predates this fix, so the countries and
# averages shown may differ when the script is re-run.
avg_Hight_Tall <- People %>%
  filter(!is.na(birthCountry), !is.na(height)) %>%
  group_by(birthCountry) %>%
  summarise(tallest = mean(height)) %>%
  arrange(desc(tallest))

# Same per-country averages, shortest first. Derived from the table above
# instead of repeating the aggregation, so the two tables cannot disagree.
avg_Hight_Short <- avg_Hight_Tall %>%
  rename(shortest = tallest) %>%
  arrange(shortest)

# Tallest and shortest country side by side.
cbind(avg_Hight_Tall[1, ], avg_Hight_Short[1, ])
## birthCountry tallest birthCountry shortest
## 1 Japan 76 Australia 67
Answer 5: The country with the tallest average player height is Japan, at 76 inches. The country with the shortest average player height is Australia, at 67 inches.
Question 6:
Add a new column to the data set for body mass index (BMI).
# Add a body mass index column: BMI = 703 * weight (lb) / height (in)^2.
# The data frame is supplied once by the pipe; it was previously also passed
# to mutate() as an argument, which only re-added the existing columns.
all_BMI <- People %>%
  mutate(BMI = (weight * 703) / (height^2))
head(all_BMI)
## playerID birthYear birthMonth birthDay birthCountry birthState birthCity
## 1 aardsda01 1981 12 27 USA CO Denver
## 2 aaronha01 1934 2 5 USA AL Mobile
## 3 aaronto01 1939 8 5 USA AL Mobile
## 4 aasedo01 1954 9 8 USA CA Orange
## 5 abadan01 1972 8 25 USA FL Palm Beach
## 6 abadfe01 1985 12 17 D.R. La Romana La Romana
## deathYear deathMonth deathDay deathCountry deathState deathCity nameFirst
## 1 NA NA NA <NA> <NA> <NA> David
## 2 2021 1 22 USA GA Atlanta Hank
## 3 1984 8 16 USA GA Atlanta Tommie
## 4 NA NA NA <NA> <NA> <NA> Don
## 5 NA NA NA <NA> <NA> <NA> Andy
## 6 NA NA NA <NA> <NA> <NA> Fernando
## nameLast nameGiven weight height bats throws debut finalGame
## 1 Aardsma David Allan 215 75 R R 2004-04-06 2015-08-23
## 2 Aaron Henry Louis 180 72 R R 1954-04-13 1976-10-03
## 3 Aaron Tommie Lee 190 75 R R 1962-04-10 1971-09-26
## 4 Aase Donald William 190 75 R R 1977-07-26 1990-10-03
## 5 Abad Fausto Andres 184 73 L L 2001-09-10 2006-04-13
## 6 Abad Fernando Antonio 235 74 L L 2010-07-28 2019-09-28
## retroID bbrefID deathDate birthDate BMI
## 1 aardd001 aardsda01 <NA> 1981-12-27 26.87022
## 2 aaroh101 aaronha01 2021-01-22 1934-02-05 24.40972
## 3 aarot101 aaronto01 1984-08-16 1939-08-05 23.74578
## 4 aased001 aasedo01 <NA> 1954-09-08 23.74578
## 5 abada001 abadan01 <NA> 1972-08-25 24.27322
## 6 abadf001 abadfe01 <NA> 1985-12-17 30.16892
Question 7:
The CDC defines a BMI greater than 30 as “within an obese range”. Create a dataframe of only the obese players. How many players are in this dataset? Which player has the highest BMI?
# Players whose BMI is above 30, highest BMI first, keeping name and BMI only.
# filter() drops rows where BMI is NA (missing weight or height).
# select() picks the columns; summarise() is meant to return one row per group
# and returning several rows from it is deprecated in current dplyr.
Obese <- all_BMI %>%
  filter(BMI > 30) %>%
  arrange(desc(BMI)) %>%
  select(nameFirst, nameLast, BMI)
head(Obese)
## nameFirst nameLast BMI
## 1 Alejandro Kirk 40.28871
## 2 Bartolo Colon 39.74509
## 3 Pablo Sandoval 38.44980
## 4 Prince Fielder 38.35053
## 5 Jumbo Diaz 38.33882
## 6 Reyes Moronta 38.01939
Answer 7: There are 551 players that are considered obese. The player with the highest BMI is Alejandro Kirk, with a BMI of 40.29.
Question 8:
Create a dataframe that compares the average body mass index (BMI) of players from each state and nation combination.
# Average BMI for every birth country / birth state combination.
# Rows are dropped only when the country, state or BMI is missing. The
# previous na.omit() removed every row with an NA in *any* column, which
# excludes all living players (their death fields are NA).
# `.groups = "drop_last"` spells out the default regrouping (the result stays
# grouped by birthCountry), so summarise() no longer prints its informational
# message about grouped output.
# NOTE: the recorded output below predates this fix, so the averages may
# differ when the script is re-run.
compare_BMI <- all_BMI %>%
  filter(!is.na(birthCountry), !is.na(birthState), !is.na(BMI)) %>%
  group_by(birthCountry, birthState) %>%
  summarise(avg_BMI = mean(BMI), .groups = "drop_last")
## `summarise()` has grouped output by 'birthCountry'. You can override using the `.groups` argument.
# Preview the first rows of the per-country/state BMI table.
head(compare_BMI)
## # A tibble: 6 × 3
## # Groups: birthCountry [3]
## birthCountry birthState avg_BMI
## <chr> <chr> <dbl>
## 1 Australia Queensland 24.7
## 2 Bahamas New Providence 24.8
## 3 CAN AB 23.8
## 4 CAN BC 25.0
## 5 CAN MB 23.9
## 6 CAN NB 25.5
Question 9:
Creating a dataset of only the baseball players who are deceased, add a new column to the data set for a player’s lifetime (in years). You can assume that each year has 365 days. Which baseball player lived the longest? How many years old was he? ● Note that both birthDate and deathDate are as.Date variables. Subtracting as.Date variables will give the number of days between the dates.
# Life span in years (365-day years) for deceased players, longest first.
# - Only players with both a birth date and a death date are kept, as the
#   question asks for deceased players; previously living players were
#   carried along with NA life spans.
# - mutate() adds the column; the former group_by() + summarise() produced
#   several rows per group, which is deprecated in current dplyr.
# - Subtracting the Dates yields a difftime in days; converting it to a plain
#   number before dividing by 365 stops years from being printed as "days".
# - as_tibble() keeps the compact tibble printing the result had before.
# NOTE: the recorded output below predates this fix (row count, grouping
# header and the "days" suffix will differ when the script is re-run).
lifeTime <- People %>%
  filter(!is.na(birthDate), !is.na(deathDate)) %>%
  mutate(
    life_Span_Year = as.numeric(deathDate - birthDate, units = "days") / 365
  ) %>%
  select(nameFirst, nameLast, deathYear, life_Span_Year) %>%
  arrange(desc(life_Span_Year)) %>%
  as_tibble()
## `summarise()` has grouped output by 'nameFirst', 'nameLast', 'deathYear'. You can override using the `.groups` argument.
# Print the life-span table (sorted with the longest-lived player first).
lifeTime
## # A tibble: 20,093 × 4
## # Groups: nameFirst, nameLast, deathYear [19,876]
## nameFirst nameLast deathYear life_Span_Year
## <chr> <chr> <int> <drtn>
## 1 Red Hoff 1998 107.4329 days
## 2 Connie Marrero 2014 103.0658 days
## 3 Bob Wright 1993 101.6959 days
## 4 Ace Parker 2013 101.5425 days
## 5 Tony Malinosky 2011 101.4082 days
## 6 Karl Swanson 2002 101.3616 days
## 7 John Daley 1988 101.3370 days
## 8 Bill Otis 1990 101.0411 days
## 9 Rollie Stiles 2007 100.7452 days
## 10 Billy Werber 2009 100.6603 days
## # … with 20,083 more rows
Answer 9: Red Hoff died in 1998 at 107 years old!
Question 10:
Convert the debut and finalGame variables to dates using the as.Date() function. Add a new column to the data set for a player’s career (in years). You can assume that each year has 365 days. Which baseball player had the longest career? How long was it?
# Career length in years (365-day years) per player, longest first.
# - debut and finalGame are converted with as.Date() before subtracting.
# - The difference is a difftime in days; converting it to a plain number
#   before dividing by 365 stops years from being printed as "days".
# - The data frame is supplied once by the pipe (it was previously also
#   passed to mutate() as an argument, which only re-added existing columns).
# - select() picks the columns; returning several rows from summarise() is
#   deprecated in current dplyr.
# NOTE: the recorded output below predates this fix; the values are the same
# but will no longer carry the "days" suffix when the script is re-run.
careerLife <- People %>%
  mutate(
    career = as.numeric(as.Date(finalGame) - as.Date(debut), units = "days") /
      365
  ) %>%
  arrange(desc(career)) %>%
  select(nameFirst, nameLast, career)
head(careerLife)
## nameFirst nameLast career
## 1 Nick Altrock 35.23836 days
## 2 Jim O'Rourke 32.42740 days
## 3 Minnie Minoso 31.48493 days
## 4 Charley O'Leary 30.48219 days
## 5 Arlie Latham 29.25479 days
## 6 Deacon McGuire 27.92329 days
Answer 10: The longest career in this database is about 35 years; it belongs to Nick Altrock.