Homework 3

Do not change anything in the following chunk

You will be working on olympic_gymnasts dataset. Do not change the code below:

# Download the TidyTuesday (2021-07-27) Olympics CSV and build the
# gymnastics-only working dataset used by every question below.
# NOTE(review): `%>%`, filter(), mutate() and case_when() are used unqualified,
# so dplyr is presumably attached in a chunk not shown here -- confirm.
olympics <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-07-27/olympics.csv')

# Result: one row per remaining record, plus a logical `medalist` column
# that is TRUE exactly when `medal` is not NA.
olympic_gymnasts <- olympics %>% 
  filter(!is.na(age)) %>%             # only keep athletes with known age
  filter(sport == "Gymnastics") %>%   # keep only gymnasts
  mutate(
    medalist = case_when(             # add column for success in medaling
      is.na(medal) ~ FALSE,           # NA values go to FALSE
      !is.na(medal) ~ TRUE            # non-NA values (Gold, Silver, Bronze) go to TRUE
    )
  )

More information about the dataset can be found at

https://github.com/rfordatascience/tidytuesday/blob/master/data/2021/2021-07-27/readme.md

Question 1: Create a subset dataset with the following columns only: name, sex, age, team, year and medalist. Call it df.

# Question 1: keep only the six columns needed for the later questions.
df <- olympic_gymnasts |>
  select(
    name,
    sex,
    age,
    team,
    year,
    medalist
  )

print(df)

## # A tibble: 25,528 × 6
##    name                    sex     age team     year medalist
##    <chr>                   <chr> <dbl> <chr>   <dbl> <lgl>   
##  1 Paavo Johannes Aaltonen M        28 Finland  1948 TRUE    
##  2 Paavo Johannes Aaltonen M        28 Finland  1948 TRUE    
##  3 Paavo Johannes Aaltonen M        28 Finland  1948 FALSE   
##  4 Paavo Johannes Aaltonen M        28 Finland  1948 TRUE    
##  5 Paavo Johannes Aaltonen M        28 Finland  1948 FALSE   
##  6 Paavo Johannes Aaltonen M        28 Finland  1948 FALSE   
##  7 Paavo Johannes Aaltonen M        28 Finland  1948 FALSE   
##  8 Paavo Johannes Aaltonen M        28 Finland  1948 TRUE    
##  9 Paavo Johannes Aaltonen M        32 Finland  1952 FALSE   
## 10 Paavo Johannes Aaltonen M        32 Finland  1952 TRUE    
## # ℹ 25,518 more rows

Question 2: From df, create df2 that only has the years 2008, 2012, and 2016.

# Question 2: restrict df to the 2008, 2012 and 2016 Games.
# seq(2008, 2016, by = 4) evaluates to exactly c(2008, 2012, 2016).
df2 <- df |>
  filter(year %in% seq(2008, 2016, by = 4))

print(df2)

## # A tibble: 2,703 × 6
##    name              sex     age team     year medalist
##    <chr>             <chr> <dbl> <chr>   <dbl> <lgl>   
##  1 Nstor Abad Sanjun M        23 Spain    2016 FALSE   
##  2 Nstor Abad Sanjun M        23 Spain    2016 FALSE   
##  3 Nstor Abad Sanjun M        23 Spain    2016 FALSE   
##  4 Nstor Abad Sanjun M        23 Spain    2016 FALSE   
##  5 Nstor Abad Sanjun M        23 Spain    2016 FALSE   
##  6 Nstor Abad Sanjun M        23 Spain    2016 FALSE   
##  7 Katja Abel        F        25 Germany  2008 FALSE   
##  8 Katja Abel        F        25 Germany  2008 FALSE   
##  9 Katja Abel        F        25 Germany  2008 FALSE   
## 10 Katja Abel        F        25 Germany  2008 FALSE   
## # ℹ 2,693 more rows

Question 3: Group by these three years (2008, 2012, and 2016) and summarize the mean age in each group.

# Question 3: mean gymnast age for each of the three selected years.
# Rows with a missing age were already removed when olympic_gymnasts was
# built, so mean() needs no na.rm here.
age_summary <- df2 |>
  group_by(year) |>
  summarise(
    mean_age = mean(age)
  )

print(age_summary)

## # A tibble: 3 × 2
##    year mean_age
##   <dbl>    <dbl>
## 1  2008     21.6
## 2  2012     21.9
## 3  2016     22.2

Question 4: Use the olympic_gymnasts dataset, group by year, and find the mean age for each year. Call this dataset oly_year. (Optional: after creating the dataset, find the minimum average age.)

# Question 4: mean gymnast age for every Olympic year in the full dataset.
oly_year <- olympic_gymnasts |>
  group_by(year) |>
  summarise(
    mean_age2 = mean(age)
  )

print(oly_year)

## # A tibble: 29 × 2
##     year mean_age2
##    <dbl>     <dbl>
##  1  1896      24.3
##  2  1900      22.2
##  3  1904      25.1
##  4  1906      24.7
##  5  1908      23.2
##  6  1912      24.2
##  7  1920      26.7
##  8  1924      27.6
##  9  1928      25.6
## 10  1932      23.9
## # ℹ 19 more rows

Question 5: This question is open-ended. Create a question that requires you to use at least two verbs. Create code that answers your question. Then, below the chunk, reflect on your question choice and coding procedure. My question: Summarize the female participants across the years in the Olympic team.

# Question 5: how many rows for female gymnasts are there in each year?
# Each row of df is one athlete's entry (the same name repeats within a
# year), so `count` is the number of entries, not of unique athletes.
my_question <- df |>
  filter(sex == "F") |>
  count(year, name = "count")

print(my_question)

## # A tibble: 20 × 2
##     year count
##    <dbl> <int>
##  1  1928    35
##  2  1936    64
##  3  1948    72
##  4  1952   898
##  5  1956   418
##  6  1960   716
##  7  1964   474
##  8  1968   587
##  9  1972   704
## 10  1976   502
## 11  1980   358
## 12  1984   379
## 13  1988   522
## 14  1992   532
## 15  1996   576
## 16  2000   502
## 17  2004   507
## 18  2008   433
## 19  2012   382
## 20  2016   383

  # Plot the yearly female counts as purple points joined by a pink line.
  # Every component, including labs(), has to be chained with `+`; a labs()
  # call written as its own statement only prints a labels object and leaves
  # the plot without a title.
  ggplot(my_question, aes(x = year, y = count)) +
    geom_point(color = "purple") +
    geom_line(color = "pink") +
    labs(
      title = "Number of Female Athletes per Year"
    )

## <ggplot2::labels> List of 1
##  $ title: chr "Number of Female Athletes per Year"

  # Show the yearly counts table again underneath the plot.
  my_question |>
    print()

## # A tibble: 20 × 2
##     year count
##    <dbl> <int>
##  1  1928    35
##  2  1936    64
##  3  1948    72
##  4  1952   898
##  5  1956   418
##  6  1960   716
##  7  1964   474
##  8  1968   587
##  9  1972   704
## 10  1976   502
## 11  1980   358
## 12  1984   379
## 13  1988   522
## 14  1992   532
## 15  1996   576
## 16  2000   502
## 17  2004   507
## 18  2008   433
## 19  2012   382
## 20  2016   383

Discussion: Enter your discussion of results here. For this question, I wanted to see the trend in female participation on the Olympic team across the years. To do that, I kept only the female participants, grouped the data by year, and summarized each group by its count. I also wanted to play with ggplot because I do not come from Data 101 and I am a visual learner, so I am looking forward to learning more about it.