#PART I


####Calling packages we need 

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(dplyr)
library(ggplot2)
library(tidyr)
#### Import the package

#install.packages("Lahman")
library(Lahman)
#### Import the data for baseball players

# Load the People data set (one row per player) and preview its first six rows.
data("People")
head(People, n = 6L)
##    playerID birthYear birthMonth birthDay birthCountry birthState  birthCity
## 1 aardsda01      1981         12       27          USA         CO     Denver
## 2 aaronha01      1934          2        5          USA         AL     Mobile
## 3 aaronto01      1939          8        5          USA         AL     Mobile
## 4  aasedo01      1954          9        8          USA         CA     Orange
## 5  abadan01      1972          8       25          USA         FL Palm Beach
## 6  abadfe01      1985         12       17         D.R.  La Romana  La Romana
##   deathYear deathMonth deathDay deathCountry deathState deathCity nameFirst
## 1        NA         NA       NA         <NA>       <NA>      <NA>     David
## 2      2021          1       22          USA         GA   Atlanta      Hank
## 3      1984          8       16          USA         GA   Atlanta    Tommie
## 4        NA         NA       NA         <NA>       <NA>      <NA>       Don
## 5        NA         NA       NA         <NA>       <NA>      <NA>      Andy
## 6        NA         NA       NA         <NA>       <NA>      <NA>  Fernando
##   nameLast        nameGiven weight height bats throws      debut  finalGame
## 1  Aardsma      David Allan    215     75    R      R 2004-04-06 2015-08-23
## 2    Aaron      Henry Louis    180     72    R      R 1954-04-13 1976-10-03
## 3    Aaron       Tommie Lee    190     75    R      R 1962-04-10 1971-09-26
## 4     Aase   Donald William    190     75    R      R 1977-07-26 1990-10-03
## 5     Abad    Fausto Andres    184     73    L      L 2001-09-10 2006-04-13
## 6     Abad Fernando Antonio    235     74    L      L 2010-07-28 2019-09-28
##    retroID   bbrefID  deathDate  birthDate
## 1 aardd001 aardsda01       <NA> 1981-12-27
## 2 aaroh101 aaronha01 2021-01-22 1934-02-05
## 3 aarot101 aaronto01 1984-08-16 1939-08-05
## 4 aased001  aasedo01       <NA> 1954-09-08
## 5 abada001  abadan01       <NA> 1972-08-25
## 6 abadf001  abadfe01       <NA> 1985-12-17
#### Learn more about the data

#?People
#str(People)
##QUESTION 1

# Players whose recorded birth country is the USA.
PeopleUSA <- filter(People, birthCountry == "USA")

# Number of US-born players per birth state, smallest count first.
# n = 51 makes print() show every row (50 states plus DC) instead of the
# default truncated tibble preview.
print(
  arrange(
    summarize(group_by(PeopleUSA, birthState), n = n()),
    n
  ),
  n = 51
)
## # A tibble: 51 x 2
##    birthState     n
##    <chr>      <int>
##  1 AK            12
##  2 WY            16
##  3 ND            19
##  4 MT            26
##  5 ID            30
##  6 NM            31
##  7 VT            38
##  8 SD            39
##  9 UT            40
## 10 HI            46
## 11 NV            47
## 12 NH            56
## 13 DE            57
## 14 ME            79
## 15 RI            80
## 16 CO            98
## 17 DC           104
## 18 NE           115
## 19 AZ           121
## 20 WV           121
## 21 OR           137
## 22 AR           160
## 23 MN           172
## 24 SC           198
## 25 CT           209
## 26 MS           212
## 27 WA           212
## 28 KS           217
## 29 IA           223
## 30 WI           252
## 31 LA           263
## 32 OK           271
## 33 KY           290
## 34 VA           303
## 35 TN           322
## 36 MD           325
## 37 AL           344
## 38 IN           381
## 39 GA           392
## 40 NC           423
## 41 NJ           444
## 42 MI           446
## 43 FL           581
## 44 MO           619
## 45 MA           671
## 46 TX           962
## 47 OH          1062
## 48 IL          1084
## 49 NY          1246
## 50 PA          1451
## 51 CA          2348
#California has the most players. Oregon has 137 players. 
##QUESTION 2 

# Players with a recorded height under 60; rows with a missing height are
# dropped by filter() as well.
shortGuys <- filter(People, height < 60)
head(shortGuys)
##    playerID birthYear birthMonth birthDay birthCountry birthState birthCity
## 1 gaedeed01      1925          6        8          USA         IL   Chicago
## 2 healeto01      1853         NA       NA          USA         RI  Cranston
##   deathYear deathMonth deathDay deathCountry deathState deathCity nameFirst
## 1      1961          6       18          USA         IL   Chicago     Eddie
## 2      1891          2        6          USA         ME  Lewiston       Tom
##   nameLast   nameGiven weight height bats throws      debut  finalGame  retroID
## 1   Gaedel Edward Carl     65     43    R      L 1951-08-19 1951-08-19 gaede101
## 2   Healey   Thomas F.    155     55 <NA>      R 1878-06-13 1878-09-09 healt101
##     bbrefID  deathDate  birthDate
## 1 gaedeed01 1961-06-18 1925-06-08
## 2 healeto01 1891-02-06       <NA>
#there are 2 players shorter than 60 inches. Their names are Eddie Gaedel & Tom Healey.
## QUESTION 3

# Split players by batting hand. Rows with a missing `bats` value match none
# of the three filters, so the three shares below need not sum to 1.
rightHand <- People %>%
  filter(bats == "R")
leftHand <- People %>%
  filter(bats == "L")
ambidextrous <- People %>%
  filter(bats == "B")

# Share of all players in each group, computed from the data rather than from
# row counts typed in by hand (originally 12437, 5247 and 1229 out of 20093,
# presumably read off these data frames). The `##` lines are the results
# recorded from the original run.
nrow(rightHand) / nrow(People)
## [1] 0.6189718
nrow(leftHand) / nrow(People)
## [1] 0.2611357
nrow(ambidextrous) / nrow(People)
## [1] 0.06116558
#61.9% of players are right handed batters, 26.11% are left handed, and 6.12% are both.
## QUESTION 4

# Mean height per throwing hand, ignoring missing heights.
# The pipe after group_by() was missing, so avgHtbyThrw used to hold the
# grouped data frame and the summary was produced by a separate statement.
# Now the whole chain is piped and avgHtbyThrw is the summary table itself.
avgHtbyThrw <- People %>%
  group_by(throws) %>%
  summarize(avgHt = mean(height, na.rm = TRUE))
avgHtbyThrw
## # A tibble: 4 x 2
##   throws avgHt
##   <fct>  <dbl>
## 1 L       72.6
## 2 R       72.4
## 3 S       72  
## 4 <NA>    69.3
#The average height for right-handed throwers is 72.3621; for left-handed throwers, it is 72.61. I also got an average height for "S"; I'm not sure what that is and can't find any help on it. The average height for NA (players we either have no data on or who don't bat) is 69.2765. Therefore, left-handed throwers are slightly taller on average.
## QUESTION 5 

# Per birth country: number of players and their mean height (missing
# heights ignored), ordered from the shortest average to the tallest.
byCountry <- arrange(
  summarize(
    group_by(People, birthCountry),
    count = n(),
    height = mean(height, na.rm = TRUE)
  ),
  height
)
head(byCountry)
## # A tibble: 6 x 3
##   birthCountry count height
##   <chr>        <int>  <dbl>
## 1 Portugal         1   65  
## 2 Denmark          1   67  
## 3 Finland          1   69  
## 4 Ireland         50   69.6
## 5 <NA>            61   69.6
## 6 Belize           1   70
#The country with the shortest players on average is Portugal at 65 in; the country with the tallest players on average is Indonesia at 78 in.
## QUESTION 6 

# Add a BMI column to every player: 703 * weight / height^2.
playerBMI <- mutate(People, BMI = 703 * weight / height^2)
head(playerBMI)
##    playerID birthYear birthMonth birthDay birthCountry birthState  birthCity
## 1 aardsda01      1981         12       27          USA         CO     Denver
## 2 aaronha01      1934          2        5          USA         AL     Mobile
## 3 aaronto01      1939          8        5          USA         AL     Mobile
## 4  aasedo01      1954          9        8          USA         CA     Orange
## 5  abadan01      1972          8       25          USA         FL Palm Beach
## 6  abadfe01      1985         12       17         D.R.  La Romana  La Romana
##   deathYear deathMonth deathDay deathCountry deathState deathCity nameFirst
## 1        NA         NA       NA         <NA>       <NA>      <NA>     David
## 2      2021          1       22          USA         GA   Atlanta      Hank
## 3      1984          8       16          USA         GA   Atlanta    Tommie
## 4        NA         NA       NA         <NA>       <NA>      <NA>       Don
## 5        NA         NA       NA         <NA>       <NA>      <NA>      Andy
## 6        NA         NA       NA         <NA>       <NA>      <NA>  Fernando
##   nameLast        nameGiven weight height bats throws      debut  finalGame
## 1  Aardsma      David Allan    215     75    R      R 2004-04-06 2015-08-23
## 2    Aaron      Henry Louis    180     72    R      R 1954-04-13 1976-10-03
## 3    Aaron       Tommie Lee    190     75    R      R 1962-04-10 1971-09-26
## 4     Aase   Donald William    190     75    R      R 1977-07-26 1990-10-03
## 5     Abad    Fausto Andres    184     73    L      L 2001-09-10 2006-04-13
## 6     Abad Fernando Antonio    235     74    L      L 2010-07-28 2019-09-28
##    retroID   bbrefID  deathDate  birthDate      BMI
## 1 aardd001 aardsda01       <NA> 1981-12-27 26.87022
## 2 aaroh101 aaronha01 2021-01-22 1934-02-05 24.40972
## 3 aarot101 aaronto01 1984-08-16 1939-08-05 23.74578
## 4 aased001  aasedo01       <NA> 1954-09-08 23.74578
## 5 abada001  abadan01       <NA> 1972-08-25 24.27322
## 6 abadf001  abadfe01       <NA> 1985-12-17 30.16892
## QUESTION 7

# Players with a BMI above 30, highest BMI first.
obese <- playerBMI %>%
  filter(BMI > 30) %>%
  arrange(desc(BMI))
head(obese)
##    playerID birthYear birthMonth birthDay birthCountry      birthState
## 1  kirkal01      1998         11        6       Mexico Baja California
## 2 colonba01      1973          5       24         D.R.    Puerto Plata
## 3 sandopa01      1986          8       11    Venezuela        Carabobo
## 4 fieldpr01      1984          5        9          USA              CA
## 5  diazju03      1984          2       27         D.R.       La Romana
## 6 moronre01      1993          1        6         D.R.        Santiago
##        birthCity deathYear deathMonth deathDay deathCountry deathState
## 1        Tijuana        NA         NA       NA         <NA>       <NA>
## 2       Altamira        NA         NA       NA         <NA>       <NA>
## 3 Puerto Cabello        NA         NA       NA         <NA>       <NA>
## 4        Ontario        NA         NA       NA         <NA>       <NA>
## 5      La Romana        NA         NA       NA         <NA>       <NA>
## 6       Santiago        NA         NA       NA         <NA>       <NA>
##   deathCity nameFirst nameLast     nameGiven weight height bats throws
## 1      <NA> Alejandro     Kirk     Alejandro    265     68    R      R
## 2      <NA>   Bartolo    Colon       Bartolo    285     71    R      R
## 3      <NA>     Pablo Sandoval  Pablo Emilio    268     70    B      R
## 4      <NA>    Prince  Fielder Prince Semien    275     71    L      R
## 5      <NA>     Jumbo     Diaz   Jose Rafael    315     76    R      R
## 6      <NA>     Reyes  Moronta Reyes Armando    265     70    R      R
##        debut  finalGame  retroID   bbrefID deathDate  birthDate      BMI
## 1 2020-09-12 2020-09-26 kirka001  kirkal01      <NA> 1998-11-06 40.28871
## 2 1997-04-04 2018-09-22 colob001 colonba01      <NA> 1973-05-24 39.74509
## 3 2008-08-14 2020-09-27 sandp001 sandopa01      <NA> 1986-08-11 38.44980
## 4 2005-06-13 2016-07-18 fielp001 fieldpr01      <NA> 1984-05-09 38.35053
## 5 2014-06-20 2017-07-16 diazj005  diazju03      <NA> 1984-02-27 38.33882
## 6 2017-09-05 2019-08-31 moror001 moronre01      <NA> 1993-01-06 38.01939
#there are 551 players that fall into the obese data set. Alejandro Kirk has the highest BMI, of 40.28871.
##QUESTION 8

# Keep only the birthplace, name and BMI columns of playerBMI.
# select() is namespace-qualified on purpose: library(MASS) (used in Part II)
# masks dplyr::select(), and in a session where MASS is already attached an
# unqualified select() call dispatches to MASS and errors. That is the likely
# cause of the error messages hit in earlier attempts at this step.
BMIbyPlace <- playerBMI %>%
  dplyr::select(birthCountry, birthState, nameFirst, nameLast, BMI)
head(BMIbyPlace)

#?select
## QUESTION 9 

# Lifespan of every player as deathDate minus birthDate. Despite its name,
# the lifeInYears column is a time difference measured in days.
lifetime <- mutate(People, lifeInYears = deathDate - birthDate)

# Players with a positive recorded lifespan, longest-lived first. Rows where
# the difference is missing are dropped by filter() as well.
deadPlayers <- lifetime %>%
  filter(lifeInYears > 0) %>%
  arrange(desc(lifeInYears))
head(deadPlayers)
##    playerID birthYear birthMonth birthDay birthCountry  birthState
## 1  hoffch01      1891          5        8          USA          NY
## 2 marreco01      1911          4       25         Cuba Villa Clara
## 3 wrighbo01      1891         12       13          USA          IN
## 4 parkeac01      1912          5       17          USA          VA
## 5 malinto01      1909         10        7          USA          IL
## 6 swanska01      1900         12       17          USA          IL
##         birthCity deathYear deathMonth deathDay deathCountry deathState
## 1        Ossining      1998          9       17          USA         FL
## 2 Sagua La Grande      2014          4       23         Cuba  La Habana
## 3  Decatur County      1993          7       30          USA         CA
## 4      Portsmouth      2013         11        6          USA         VA
## 5    Collinsville      2011          2        8          USA         CA
## 6 North Henderson      2002          4        3          USA         IL
##       deathCity nameFirst  nameLast         nameGiven weight height bats throws
## 1 Daytona Beach       Red      Hoff Chester Cornelius    162     69    L      L
## 2     La Habana    Connie   Marrero   Conrado Eugenio    158     65    R      R
## 3    Carmichael       Bob    Wright    Robert Cassius    175     73    R      R
## 4    Portsmouth       Ace    Parker    Clarence McKay    180     72    R      R
## 5        Oxnard      Tony Malinosky   Anthony Francis    165     70    R      R
## 6   Rock Island      Karl   Swanson       Karl Edward    155     70    L      R
##        debut  finalGame  retroID   bbrefID  deathDate  birthDate lifeInYears
## 1 1911-09-06 1915-10-02 hoffr102  hoffre01 1998-09-17 1891-05-08  39213 days
## 2 1950-04-21 1954-09-07 marrc101 marreco01 2014-04-23 1911-04-25  37619 days
## 3 1915-09-21 1915-09-24 wrigb101 wrighbo01 1993-07-30 1891-12-13  37119 days
## 4 1937-04-24 1938-09-04 parka102 parkeac01 2013-11-06 1912-05-17  37063 days
## 5 1937-04-26 1937-07-16 malit101 malinto01 2011-02-08 1909-10-07  37014 days
## 6 1928-08-12 1929-05-05 swank101 swanska01 2002-04-03 1900-12-17  36997 days
#the player who lived the longest is Red Hoff (given name is Chester Cornelius)

# Longest lifespan in years, taken from the data instead of a number typed in
# by hand. deadPlayers is sorted by descending lifespan, so row 1 is the
# maximum; the stored difference is in days. Dividing by 365 ignores leap days.
as.numeric(deadPlayers$lifeInYears[1], units = "days") / 365 # = 107.4329 years lived
## [1] 107.4329
## QUESTION 10 

# as.Date() returns a new vector and leaves its input untouched, so calling
# as.Date(People$debut, "%Y-%m-%d") on its own only prints the converted
# values and the column stays character. The converted columns have to be
# stored; here they go into a new data frame so People itself is unchanged.
PeopleDates <- People %>%
  mutate(
    debut = as.Date(debut, "%Y-%m-%d"),
    finalGame = as.Date(finalGame, "%Y-%m-%d")
  )

# Both columns should now report class "Date".
class(PeopleDates$debut)
class(PeopleDates$finalGame)
#PART II

#install.packages("MASS")
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Load the oats data set from MASS.
data(oats)
#?oats
##DATA STRUCTURE


##QUESTION 1: what does each row of this dataset represent?
# Compact overview of the data frame: dimensions, column types, first values.
str(object = oats)
## 'data.frame':    72 obs. of  4 variables:
##  $ B: Factor w/ 6 levels "I","II","III",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ V: Factor w/ 3 levels "Golden.rain",..: 3 3 3 3 1 1 1 1 2 2 ...
##  $ N: Factor w/ 4 levels "0.0cwt","0.2cwt",..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Y: int  111 130 157 174 117 114 161 141 105 140 ...
#Each row represents a plot that the farmer made, 72 rows for his 72 plots
## QUESTION 2: What do the columns of this dataset represent? Indicate whether each variable in the study is numerical or categorical. If numerical, identify as continuous or discrete. If categorical, identify as ordinal or nominal.

#1st column is "B" & represents which of the 6 blocks each plot is on. Variable "B" is categorical & nominal.

#2nd column is "V" & represents which of the 3 varieties of oats were planted. Variable "V" is categorical & nominal.

#3rd column is "N" & represents the nitrogen treatment used for each plot. Variable is numerical & discrete.

#4th column is "Y" & represents yields in 1/4 lbs per sub-plot, each sub-plot having an area of 1/80 acre. Variable is numerical & continuous. 
## QUESTION 3: What are the response and explanatory variables in this study? 

#Explanatory variables are oat variety & Nitrogen treatment in cwt. The response variable is yields in 1/4 lbs per sub-plot. The blocks variable is being used as a way to control where plots were located geographically on the farmer's land. 
## QUESTION 4: Create a hypothesis about nitrogen fertilizer concentration levels.

#My hypothesis (which, due to my limited knowledge on this topic, is not very well supported) is that the "Victory" variety of oats, with the highest nitrogen treatment level will yield the most crop.
##GRAPHICS & EDA


##QUESTION 5: Plot 1- create a side by side boxplot that illustrates the yield distribution for EACH nitrogen fertilizer concentration level and allows for both visual comparison across and within treatments.

# Plot 1: one box per nitrogen level (N) showing the distribution of yield (Y).
ggplot(data = oats, mapping = aes(x = N, y = Y)) +
  geom_boxplot()

##QUESTION 6: Look at plot 1. What are your observations from this plot? Does your hypothesis appear to be supported? Explain.

#Observations from plot -> looking at this box plot, we can see a general trend that the higher the nitrogen treatment level used, the more crop that was yielded from the corresponding field plot. This is true throughout the entire summary that is shown on the box plot, with the 0.6cwt N value having the highest minimum, Q1, median, Q3, and maximum values for Y. 

#Hypothesis supported? -> This data does support my hypothesis from q4, since the box plot illustrates a general trend of higher values for N correlating to higher values of Y. That being said, this doesn't address data collected on the variety of oats being planted, so we cannot tell from this plot whether it is the nitrogen level, oat variety, or both that correlates to yield size. 
##QUESTION 7:Plot 2- create a side by side box plot that illustrates the yield distribution for each oat variety treatment. Let's add some color! Fill the boxes with a different color for each variety.

# Plot 2: one box per oat variety (V) showing the distribution of yield (Y).
# The variety is mapped to `fill` so each box is filled with its own colour;
# mapping it to `color` only changes the box outlines.
ggplot(oats, aes(x = V, y = Y, fill = V)) +
  geom_boxplot()

## QUESTION 8: Look at plot 2. What are your observations from this plot?Do any varieties stand out as the best producer? Explain. 

#Observations from this plot -> this plot indicates that the Marvellous variety has the greatest median at about 112.5 1/4 lbs/sub-plot. Golden rain is next, with Victory having the lowest median. The minimum, Q3, and Maximum values for V are not very notably different from each other, but the Q1 values follow a similar trend to the median with Marvellous having the greatest value, and Victory having the lowest value. This box plot also shows us that it appears the Victory oat variety data has an outlier point above the rest of the data at about 175 1/4 lbs/sub-plot.
## QUESTION 9: Plot 3- add facets to your plot from part 6 to compare yields across nitrogen fertilizer concentration levels and the oat varieties.

# Plot 3: yield (Y) by oat variety (V), one panel per nitrogen level (N).
# The variety is mapped to `fill` so the boxes are filled with colour,
# matching the filled boxes asked for in plot 2; `color` would only tint
# the outlines.
ggplot(oats, aes(x = V, y = Y, fill = V)) +
  geom_boxplot() +
  facet_wrap(~N)

##QUESTION 10: Look at plot 3. What are your observations? Do the trends appear to be consistent? 

#Observations; trends consistent? -> There are quite a few trends that remain consistent throughout the 3 different plots that are illustrated in Plot 3. Plot 3 illustrates the same trend noted in Plot 1, which is that it appears that as the nitrogen level increases, so does the value for crops yielded. Also, Plot 3 shows that even when grouped by nitrogen level, Victory variety has an outlier above the rest of the dataset. This indicates that whatever is causing this outlier is something that happened throughout each N value. Across all four of the values for N level, Victory consistently has the lowest median yield value. 

#New trends/observations -> When we separate the data out by Nitrogen level (like we do in Plot 3), it becomes clear to us that the Marvellous variety of oats does not always maintain the highest median value. At the value N=0.6cwt, Golden.rain has a higher median value for Y. In the plot for the value of N=0.4cwt, the box for the Golden.rain variety shows us that these data have a relatively wide range of values for Y; meanwhile, the range of values for the data in both the Marvellous and Victory varieties on this plot is quite narrow. I am not entirely sure why this might be, but it seems noteworthy. The Victory variety of oats has the lowest IQR value across all four N values.
#CONCLUSION


##QUESTION 11: What advice would you give the farmer exploring this data?

#In general, the trend seems to be that if you want to increase the oat yield, you should opt for fertilizers that have higher nitrogen content in them. According to the tests that we ran, the fertilizer/oat variety combo with the highest median value of yield is .6cwt nitrogen fertilizer/Golden.rain oat variety. That being said, the Marvellous oat variety produces high yields more consistently at both the .4cwt and .6cwt Nitrogen fertilizer levels. With the information gathered from this study, I would recommend using the Marvellous oat variety and the 0.6cwt Nitrogen Fertilizer level (though .4 would also do well). I would not recommend using the Victory oat Variety. I might suggest the farmer look at the correlating block numbers for each growing plot. Their knowledge of the ideal growing conditions as well as the conditions that occurred during the measured growing period might be able to help piece together some of these observations I was unsure about.