#install.packages("Lahman")
library(tidyverse)
library(Lahman)
# Load the `People` data set into the workspace and preview its first rows.
data("People")
head(People, n = 6)
## playerID birthYear birthMonth birthDay birthCountry birthState birthCity
## 1 aardsda01 1981 12 27 USA CO Denver
## 2 aaronha01 1934 2 5 USA AL Mobile
## 3 aaronto01 1939 8 5 USA AL Mobile
## 4 aasedo01 1954 9 8 USA CA Orange
## 5 abadan01 1972 8 25 USA FL Palm Beach
## 6 abadfe01 1985 12 17 D.R. La Romana La Romana
## deathYear deathMonth deathDay deathCountry deathState deathCity nameFirst
## 1 NA NA NA <NA> <NA> <NA> David
## 2 2021 1 22 USA GA Atlanta Hank
## 3 1984 8 16 USA GA Atlanta Tommie
## 4 NA NA NA <NA> <NA> <NA> Don
## 5 NA NA NA <NA> <NA> <NA> Andy
## 6 NA NA NA <NA> <NA> <NA> Fernando
## nameLast nameGiven weight height bats throws debut finalGame
## 1 Aardsma David Allan 215 75 R R 2004-04-06 2015-08-23
## 2 Aaron Henry Louis 180 72 R R 1954-04-13 1976-10-03
## 3 Aaron Tommie Lee 190 75 R R 1962-04-10 1971-09-26
## 4 Aase Donald William 190 75 R R 1977-07-26 1990-10-03
## 5 Abad Fausto Andres 184 73 L L 2001-09-10 2006-04-13
## 6 Abad Fernando Antonio 235 74 L L 2010-07-28 2019-09-28
## retroID bbrefID deathDate birthDate
## 1 aardd001 aardsda01 <NA> 1981-12-27
## 2 aaroh101 aaronha01 2021-01-22 1934-02-05
## 3 aarot101 aaronto01 1984-08-16 1939-08-05
## 4 aased001 aasedo01 <NA> 1954-09-08
## 5 abada001 abadan01 <NA> 1972-08-25
## 6 abadf001 abadfe01 <NA> 1985-12-17
#?People
# Number of US-born players per birth state: one row per `birthState`,
# with the player count in column `n`.
usplayers <- People %>%
  filter(birthCountry == "USA") %>%
  group_by(birthState) %>%
  summarise(n = n())
usplayers
## # A tibble: 51 x 2
## birthState n
## <chr> <int>
## 1 AK 12
## 2 AL 344
## 3 AR 160
## 4 AZ 121
## 5 CA 2348
## 6 CO 98
## 7 CT 209
## 8 DC 104
## 9 DE 57
## 10 FL 581
## # ... with 41 more rows
California is the birth state of the most players. 137 players were born in Oregon.
# Players whose recorded height is below 60; rows with a missing height
# are dropped by filter().
shortplayers <- filter(People, height < 60)
shortplayers
## playerID birthYear birthMonth birthDay birthCountry birthState birthCity
## 1 gaedeed01 1925 6 8 USA IL Chicago
## 2 healeto01 1853 NA NA USA RI Cranston
## deathYear deathMonth deathDay deathCountry deathState deathCity nameFirst
## 1 1961 6 18 USA IL Chicago Eddie
## 2 1891 2 6 USA ME Lewiston Tom
## nameLast nameGiven weight height bats throws debut finalGame retroID
## 1 Gaedel Edward Carl 65 43 R L 1951-08-19 1951-08-19 gaede101
## 2 Healey Thomas F. 155 55 <NA> R 1878-06-13 1878-09-09 healt101
## bbrefID deathDate birthDate
## 1 gaedeed01 1961-06-18 1925-06-08
## 2 healeto01 1891-02-06 <NA>
There are two players who are less than 60 inches tall: Eddie Gaedel and Tom Healey.
# Share of players batting left ("L"), right ("R") or both ("B").
# Players with a missing `bats` value are excluded from each proportion.
bathand <- People %>%
  summarise(
    left = mean(bats == "L", na.rm = TRUE),
    right = mean(bats == "R", na.rm = TRUE),
    both = mean(bats == "B", na.rm = TRUE)
  )
bathand
## left right both
## 1 0.2774282 0.65759 0.06498176
Among players with a recorded batting hand, 27.743% bat left-handed, 65.759% bat right-handed, and 6.498% bat with both hands.
# Mean height (missing heights ignored) for each level of `bats`.
# NOTE(review): the object is called `heightvsthrow`, but the grouping
# column is `bats` (batting hand), not `throws` -- confirm which was intended.
heightvsthrow <- People %>%
  group_by(bats) %>%
  summarise(avgheight = mean(height, na.rm = TRUE))
heightvsthrow
## # A tibble: 4 x 2
## bats avgheight
## <fct> <dbl>
## 1 B 71.7
## 2 L 72.3
## 3 R 72.6
## 4 <NA> 69.5
On average, players who bat right-handed (72.6 inches) are slightly taller than players who bat left-handed (72.3 inches). Note that the table above groups by batting hand (`bats`), not throwing hand.
# Mean player height per birth country (missing heights ignored),
# ordered from the tallest average to the shortest.
avgheight_country <- People %>%
  group_by(birthCountry) %>%
  summarise(avgheight = mean(height, na.rm = TRUE)) %>%
  arrange(desc(avgheight))
# Top of the ranking: the six tallest averages.
head(avgheight_country, n = 6)
## # A tibble: 6 x 2
## birthCountry avgheight
## <chr> <dbl>
## 1 Indonesia 78
## 2 Belgium 77
## 3 Hong Kong 76
## 4 Jamaica 75.2
## 5 Afghanistan 75
## 6 Lithuania 75
# Last six rows of the country table.
tail(avgheight_country, n = 6)
## # A tibble: 6 x 2
## birthCountry avgheight
## <chr> <dbl>
## 1 Belize 70
## 2 <NA> 69.6
## 3 Ireland 69.6
## 4 Finland 69
## 5 Denmark 67
## 6 Portugal 65
Indonesia has the tallest players on average, while Portugal has the shortest players on average.
# Add a body-mass-index column computed as 703 * weight / height^2.
People2 <- People %>%
  mutate(BMI = (703 * weight) / height^2)
head(People2, n = 6)
## playerID birthYear birthMonth birthDay birthCountry birthState birthCity
## 1 aardsda01 1981 12 27 USA CO Denver
## 2 aaronha01 1934 2 5 USA AL Mobile
## 3 aaronto01 1939 8 5 USA AL Mobile
## 4 aasedo01 1954 9 8 USA CA Orange
## 5 abadan01 1972 8 25 USA FL Palm Beach
## 6 abadfe01 1985 12 17 D.R. La Romana La Romana
## deathYear deathMonth deathDay deathCountry deathState deathCity nameFirst
## 1 NA NA NA <NA> <NA> <NA> David
## 2 2021 1 22 USA GA Atlanta Hank
## 3 1984 8 16 USA GA Atlanta Tommie
## 4 NA NA NA <NA> <NA> <NA> Don
## 5 NA NA NA <NA> <NA> <NA> Andy
## 6 NA NA NA <NA> <NA> <NA> Fernando
## nameLast nameGiven weight height bats throws debut finalGame
## 1 Aardsma David Allan 215 75 R R 2004-04-06 2015-08-23
## 2 Aaron Henry Louis 180 72 R R 1954-04-13 1976-10-03
## 3 Aaron Tommie Lee 190 75 R R 1962-04-10 1971-09-26
## 4 Aase Donald William 190 75 R R 1977-07-26 1990-10-03
## 5 Abad Fausto Andres 184 73 L L 2001-09-10 2006-04-13
## 6 Abad Fernando Antonio 235 74 L L 2010-07-28 2019-09-28
## retroID bbrefID deathDate birthDate BMI
## 1 aardd001 aardsda01 <NA> 1981-12-27 26.87022
## 2 aaroh101 aaronha01 2021-01-22 1934-02-05 24.40972
## 3 aarot101 aaronto01 1984-08-16 1939-08-05 23.74578
## 4 aased001 aasedo01 <NA> 1954-09-08 23.74578
## 5 abada001 abadan01 <NA> 1972-08-25 24.27322
## 6 abadf001 abadfe01 <NA> 1985-12-17 30.16892
# Players whose BMI exceeds 30, ordered from highest BMI downwards.
# Rows with a missing BMI are dropped by filter().
obese <- arrange(
  filter(People2, BMI > 30),
  desc(BMI)
)
head(obese, n = 6)
## playerID birthYear birthMonth birthDay birthCountry birthState
## 1 kirkal01 1998 11 6 Mexico Baja California
## 2 colonba01 1973 5 24 D.R. Puerto Plata
## 3 sandopa01 1986 8 11 Venezuela Carabobo
## 4 fieldpr01 1984 5 9 USA CA
## 5 diazju03 1984 2 27 D.R. La Romana
## 6 moronre01 1993 1 6 D.R. Santiago
## birthCity deathYear deathMonth deathDay deathCountry deathState
## 1 Tijuana NA NA NA <NA> <NA>
## 2 Altamira NA NA NA <NA> <NA>
## 3 Puerto Cabello NA NA NA <NA> <NA>
## 4 Ontario NA NA NA <NA> <NA>
## 5 La Romana NA NA NA <NA> <NA>
## 6 Santiago NA NA NA <NA> <NA>
## deathCity nameFirst nameLast nameGiven weight height bats throws
## 1 <NA> Alejandro Kirk Alejandro 265 68 R R
## 2 <NA> Bartolo Colon Bartolo 285 71 R R
## 3 <NA> Pablo Sandoval Pablo Emilio 268 70 B R
## 4 <NA> Prince Fielder Prince Semien 275 71 L R
## 5 <NA> Jumbo Diaz Jose Rafael 315 76 R R
## 6 <NA> Reyes Moronta Reyes Armando 265 70 R R
## debut finalGame retroID bbrefID deathDate birthDate BMI
## 1 2020-09-12 2020-09-26 kirka001 kirkal01 <NA> 1998-11-06 40.28871
## 2 1997-04-04 2018-09-22 colob001 colonba01 <NA> 1973-05-24 39.74509
## 3 2008-08-14 2020-09-27 sandp001 sandopa01 <NA> 1986-08-11 38.44980
## 4 2005-06-13 2016-07-18 fielp001 fieldpr01 <NA> 1984-05-09 38.35053
## 5 2014-06-20 2017-07-16 diazj005 diazju03 <NA> 1984-02-27 38.33882
## 6 2017-09-05 2019-08-31 moror001 moronre01 <NA> 1993-01-06 38.01939
551 players in this dataset have a BMI above 30. The player with the highest BMI is Alejandro Kirk.
# Mean BMI (missing values ignored) for every combination of birth country
# and birth state. No `.groups` argument is given, so summarise() drops only
# the last grouping level: the result stays grouped by `birthCountry` and
# dplyr prints the informational message reproduced below.
BMI_bystatecountry <- People2 %>%
  group_by(birthCountry, birthState) %>%
  summarise(avgBMI = mean(BMI, na.rm = TRUE))
## `summarise()` has grouped output by 'birthCountry'. You can override using the `.groups` argument.
BMI_bystatecountry
## # A tibble: 338 x 3
## # Groups: birthCountry [58]
## birthCountry birthState avgBMI
## <chr> <chr> <dbl>
## 1 Afghanistan <NA> 26.9
## 2 American Samoa <NA> 28.5
## 3 Aruba <NA> 27.6
## 4 At Sea Atlantic Ocean 23.7
## 5 Australia Capital Territory 25.1
## 6 Australia New South Wales 26.7
## 7 Australia Queensland 25.1
## 8 Australia South Australia 24.9
## 9 Australia Victoria 25.7
## 10 Australia Western Australia 28.0
## # ... with 328 more rows
# Lifespan of every player with a recorded death date, longest-lived first.
# `lifetime_years` is the number of days between birth and death divided
# by 365, i.e. an approximation that ignores leap days. Players without a
# birth date get NA, which arrange(desc()) sorts to the end.
deceased <- People %>%
  filter(!is.na(deathDate)) %>%
  mutate(lifetime_years = as.numeric((deathDate - birthDate) / 365)) %>%
  arrange(desc(lifetime_years))
head(deceased)
## playerID birthYear birthMonth birthDay birthCountry birthState
## 1 hoffch01 1891 5 8 USA NY
## 2 marreco01 1911 4 25 Cuba Villa Clara
## 3 wrighbo01 1891 12 13 USA IN
## 4 parkeac01 1912 5 17 USA VA
## 5 malinto01 1909 10 7 USA IL
## 6 swanska01 1900 12 17 USA IL
## birthCity deathYear deathMonth deathDay deathCountry deathState
## 1 Ossining 1998 9 17 USA FL
## 2 Sagua La Grande 2014 4 23 Cuba La Habana
## 3 Decatur County 1993 7 30 USA CA
## 4 Portsmouth 2013 11 6 USA VA
## 5 Collinsville 2011 2 8 USA CA
## 6 North Henderson 2002 4 3 USA IL
## deathCity nameFirst nameLast nameGiven weight height bats throws
## 1 Daytona Beach Red Hoff Chester Cornelius 162 69 L L
## 2 La Habana Connie Marrero Conrado Eugenio 158 65 R R
## 3 Carmichael Bob Wright Robert Cassius 175 73 R R
## 4 Portsmouth Ace Parker Clarence McKay 180 72 R R
## 5 Oxnard Tony Malinosky Anthony Francis 165 70 R R
## 6 Rock Island Karl Swanson Karl Edward 155 70 L R
## debut finalGame retroID bbrefID deathDate birthDate lifetime_years
## 1 1911-09-06 1915-10-02 hoffr102 hoffre01 1998-09-17 1891-05-08 107.4329
## 2 1950-04-21 1954-09-07 marrc101 marreco01 2014-04-23 1911-04-25 103.0658
## 3 1915-09-21 1915-09-24 wrigb101 wrighbo01 1993-07-30 1891-12-13 101.6959
## 4 1937-04-24 1938-09-04 parka102 parkeac01 2013-11-06 1912-05-17 101.5425
## 5 1937-04-26 1937-07-16 malit101 malinto01 2011-02-08 1909-10-07 101.4082
## 6 1928-08-12 1929-05-05 swank101 swanska01 2002-04-03 1900-12-17 101.3616
The baseball player that lived the longest was Red Hoff. He was 107.433 years old.
# Career length per player: days between debut and final game divided by
# 365 (leap days ignored), sorted from the longest career to the shortest.
People3 <- People %>%
  mutate(
    career_years = as.numeric((as.Date(finalGame) - as.Date(debut)) / 365)
  ) %>%
  arrange(desc(career_years))
head(People3, n = 6)
## playerID birthYear birthMonth birthDay birthCountry birthState birthCity
## 1 altroni01 1876 9 15 USA OH Cincinnati
## 2 orourji01 1850 9 1 USA CT Bridgeport
## 3 minosmi01 1925 11 29 Cuba La Habana La Habana
## 4 olearch01 1875 10 15 USA IL Chicago
## 5 lathaar01 1860 3 15 USA NH West Lebanon
## 6 mcguide01 1863 11 18 USA OH Youngstown
## deathYear deathMonth deathDay deathCountry deathState deathCity nameFirst
## 1 1965 1 20 USA DC Washington Nick
## 2 1919 1 8 USA CT Bridgeport Jim
## 3 2015 3 1 USA IL Chicago Minnie
## 4 1941 1 6 USA IL Chicago Charley
## 5 1952 11 29 USA NY Garden City Arlie
## 6 1936 10 31 USA MI Duck Lake Deacon
## nameLast nameGiven weight height bats throws debut
## 1 Altrock Nicholas 197 70 B L 1898-07-14
## 2 O'Rourke James Henry 185 68 R R 1872-04-26
## 3 Minoso Saturnino Orestes Armas 175 70 R R 1949-04-19
## 4 O'Leary Charles Timothy 165 67 R R 1904-04-14
## 5 Latham Walter Arlington 150 68 R R 1880-07-05
## 6 McGuire James Thomas 185 73 R R 1884-06-21
## finalGame retroID bbrefID deathDate birthDate career_years
## 1 1933-10-01 altrn101 altroni01 1965-01-20 1876-09-15 35.23836
## 2 1904-09-22 orouj103 o'rouji01 1919-01-08 1850-09-01 32.42740
## 3 1980-10-05 minom101 minosmi01 2015-03-01 1925-11-29 31.48493
## 4 1934-09-30 oleac101 o'leach01 1941-01-06 1875-10-15 30.48219
## 5 1909-09-30 latha101 lathaar01 1952-11-29 1860-03-15 29.25479
## 6 1912-05-18 mcgud101 mcguide01 1936-10-31 1863-11-18 27.92329
The player with the longest career was Nick Altrock, with a career of 35.238 years.
#install.packages("MASS")
# Attach MASS without its select() function: MASS is loaded after tidyverse,
# so a plain library(MASS) would mask dplyr::select() for the rest of the
# session. MASS::select() remains reachable with the explicit prefix.
library(MASS, exclude = "select")
data(oats)
#?oats
Each row of the dataset represents one observation: a single combination of oat variety, nitrogen treatment, and block.
The columns of the dataset represent the variables that are changed or measured for every observation. The first column, B, represents the block that the oat plant was grown in. It is a categorical, ordinal variable. The second column, V, represents the variety of oat plant grown. It is a categorical variable, but it is not ordinal. The third column, N, represents the level of nitrogen fertilizer used. It is a categorical, ordinal variable. The final column, Y, is the yield of the plant. It is a numeric, continuous variable.
The explanatory variables in this study are the fertilizer level and the oat variety. The response variable is the yield of the oat plant.
Hypothesis: the higher the nitrogen fertilizer concentration, the higher the yield will be.
# Boxplots of yield (Y) for each nitrogen level (N), one fill colour per level.
plot1 <- ggplot(oats, aes(x = N, y = Y, fill = N)) +
  geom_boxplot()
plot1
This plot shows that the median yield increases with increasing fertilizer concentration. The boxes for the different fertilizer concentrations have similar IQRs. These boxplots do appear to support my hypothesis.
# Boxplots of yield (Y) for each oat variety (V), one fill colour per variety.
plot2 <- ggplot(oats, aes(x = V, y = Y, fill = V)) +
  geom_boxplot()
plot2
The three oat varieties have different median yields, with Victory having the lowest and Marvellous having the highest. However, their maximum yields (not including outliers) are about the same. This plot neither supports nor contradicts my hypothesis, because the hypothesis concerns only the nitrogen fertilizer levels of the oats, not the varieties.
# Boxplots of yield (Y) by oat variety (V), drawn in one panel (column)
# per nitrogen level (N) so the varieties can be compared within each level.
plot3 <- ggplot(oats, aes(x = V, y = Y, fill = V)) +
  geom_boxplot() +
  facet_grid(. ~ N)
plot3
Overall, the data seems fairly consistent with the previous plots. Each oat variety seems to increase yield with increasing nitrogen concentration, and until the 0.6cwt concentration, the order of the median yields is consistent with plot 2. In this plot, the maximums of the three varieties are not consistent with each other. Overall, I would conclude that the data in this plot does support my hypothesis.
After exploring these data, I would tell the farmer that the 0.6cwt concentration of nitrogen fertilizer is generally the best for the yield of oats, and that the “victory” variety of oat is far underperforming the others, other than its outliers. For lower concentrations of fertilizer, the “marvellous” variety is generally the best in terms of yield, but at 0.6cwt, the “golden rain” variety has the highest median yield. Therefore, I would recommend planting the golden rain oats and using a 0.6cwt concentration of the fertilizer in order to maximize yield.