key = a set of variables used to connect a pair of dataframes
primary key = identifies individual rows its own data frame. Give me a value, I can tell the unique row that has that value.
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.2 ✓ dplyr 1.0.6
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(help = "babynames")
Ans: Applicants, Baby names, Births and Lifetables.
library(babynames)
head(applicants) #Applicants.
head(babynames) #Baby names.
head(births) # Births
head(lifetables) # Lifetables
# applicants : the keys are year and sex.
applicants %>%
group_by(year,sex) %>%
count() %>%
filter(n > 1)
# babynames: the keys are year, sex, and name.
babynames %>%
group_by(year, sex, name) %>%
count() %>%
filter(n > 1)
# babynames: the key is year.
births %>%
group_by(year) %>%
count() %>%
filter(n > 1)
# lifetables: the keys are x, sex, and year. (qx:ex are not)
lifetables %>%
group_by(x,sex,year) %>%
count() %>%
filter(n > 1)
library(help = "nasaweather")
Ans: Atmospheric data, Country borders, Elevation, Glacier locations and Storm tracks data.
library(nasaweather)
##
## Attaching package: 'nasaweather'
## The following object is masked from 'package:dplyr':
##
## storms
head(atmos) #Atmospheric_data.
# head(borders)#Country_borders
head(elev) #Elevation.
head(glaciers) #Glacier_locations
head(storms) #Storm_tracks_data
# Test note chunk
if (FALSE){
Atmospheric_data.
Country_borders
Elevation.
Glacier_locations
Storm_tracks_data
}
# atmos:the keys are lat, long, year, month
nasaweather::atmos %>%
count(lat, long, year, month) %>%
filter(n > 1) %>%
nrow()
## [1] 0
# borders:No key in this data frame
#nasaweather::borders %>%
#count(country, long, lat, group) %>%
#filter(n > 1) %>%
#nrow()
# elev:the keys are long, lat
nasaweather::elev %>%
count(long, lat) %>%
filter(n > 1) %>%
nrow()
## [1] 0
# glaciers:the key is id
nasaweather::glaciers %>%
count(id) %>%
filter(n > 1) %>%
nrow()
## [1] 0
# storms:the key is name,year, month, day, hour, lat or
# name,year, month, day, hour, lat and long
# It should have been the listed variables Plus One other variable to differentiate, so it could have been lat, or long or wind, or type but not pressure or seasday. Adding two is not necessary.
nasaweather::storms %>%
count(name,year, month, day, hour, lat) %>%
filter(n > 1) %>%
nrow()
## [1] 0
library(tidyverse)
library(Lahman)
help("Lahman-package")
head(Master)
head(Batting)
head(Pitching)
head(Fielding)
head(Teams)
head(Salaries)
• Show the first ten names (arranged in alphabetical order of last name).
• Note the World Series was not played each year and began in 1903.
Teams %>%
filter(yearID >= 1903 ) %>%
filter(LgWin == "Y" & teamID == "BOS") %>%
filter(!is.na(WSWin)) %>% # WSWin should be Y or No
select(yearID,lgID,teamID,LgWin,WSWin) -> Boston_League
left_join( Boston_League,Fielding,by = c("yearID", "lgID", "teamID")) %>%
left_join(Master, by = "playerID" ) %>%
filter(stint > 0) %>%
select(nameFirst, nameLast, yearID) %>%
arrange(nameLast) %>%
distinct() %>% #unique() also
head(10)
• Construct a data frame containing the total salary for each player for each year.
• Construct a second data frame containing the total number of at bats and hits for each player in a year.
Lahman::Salaries %>%
group_by(yearID,playerID) %>%
summarise(salary_total = sum(salary, na.rm = TRUE)) -> salary_total
## `summarise()` has grouped output by 'yearID'. You can override using the `.groups` argument.
Lahman::Batting %>%
group_by(yearID,playerID) %>%
summarise(total_bats = sum(AB, na.rm = TRUE),
total_hits = sum(H, na.rm = TRUE)) -> total_bats_and_hits
## `summarise()` has grouped output by 'yearID'. You can override using the `.groups` argument.
full_join(salary_total, total_bats_and_hits, by = c("yearID", "playerID")) ->
multiple_teams_each_year
multiple_teams_each_year
• Using the data frames you created in part 3, create a new data frame with batting average and salary information for only players in the years after 1985 (when salary information started being collected) who had a minimum of 400 at bats.
after_1985 <- multiple_teams_each_year %>%
mutate(batting_average = (total_hits / total_bats)) %>%
filter(yearID > 1985 & total_bats >= 400) %>% #including 1985
select(yearID, playerID, batting_average, salary_total)
after_1985
• Explore the marginal association between a player’s batting average and their salary.
after_1985 %>%
ggplot(aes(x = batting_average, y = salary_total))+
geom_point()+
scale_x_log10()+
scale_y_log10()+
ggtitle("Association between a player’s batting average and their salary") +
geom_smooth(method = "lm", se = FALSE )+
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 665 rows containing non-finite values (stat_smooth).
## Warning: Removed 665 rows containing missing values (geom_point).
• Explore if this association has changed over time (for example, because sports teams are getting more stats-savvy). Hint: figure out how to set the color based on year.
after_1985 %>%
ggplot(aes(x = batting_average, y = salary_total, color = as.factor(yearID)))+
scale_y_log10()+
ggtitle("Association between a player batting average and their salary") +
geom_smooth(method = "lm", se = FALSE,linetype = "dashed")+
theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 665 rows containing non-finite values (stat_smooth).
Conclusion: The relationship between the batting average and the total salary is positive. On the other hand, when the batting average is increasing, the total salary is increasing, too. (the year of legend is not affected the batting average and total of salary.) Therefore, what happens to totally salary, while batting average is increasing over time.
Lahman::Master %>%
filter(nameFirst == "John") -> John_bio
left_join(John_bio, Salaries, by = "playerID") %>%
select(yearID,nameFirst,nameLast,salary) %>%
arrange(desc(salary)) %>%
filter(yearID %%2 == 0) %>%
head(10)