– Analysis of Penguins Dataset –

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(palmerpenguins)
library(dplyr)


# Working copy of the penguins data frame shipped with palmerpenguins; every
# example below starts from `data`.
data <- palmerpenguins::penguins

Exploring our data with dplyr

Main functions we’ll use

- arrange()
- filter()
- select()
- mutate()
- summarise() (you can also use summarize())

Reading and writing R code

One thing that I really enjoy about working in R is that I can write out what I want to do in a sentence and then translate that sentence into code. For example, if I say:

Take the penguins dataset and then filter for all penguins that live on Torgersen island

- "Take the penguins dataset" translates to `data` (our copy of `penguins`)
- "and then" translates to `%>%`
- "filter for all penguins that live on Torgersen island" translates to `filter(island == "Torgersen")`

We can then take these three pieces and put them together to get the following:

# Sentence to code: start from `data`, "and then" (%>%) keep only the rows
# whose island is Torgersen.
data %>% filter(island == "Torgersen")
## # A tibble: 52 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           NA            NA                  NA          NA
##  5 Adelie  Torgersen           36.7          19.3               193        3450
##  6 Adelie  Torgersen           39.3          20.6               190        3650
##  7 Adelie  Torgersen           38.9          17.8               181        3625
##  8 Adelie  Torgersen           39.2          19.6               195        4675
##  9 Adelie  Torgersen           34.1          18.1               193        3475
## 10 Adelie  Torgersen           42            20.2               190        4250
## # … with 42 more rows, and 2 more variables: sex <fct>, year <int>

Applying arrange()

arrange() “arranges,” or organizes, the rows of our data by the values of a column. By default the order is ascending: from the lowest value to the highest for numbers, and alphabetical for character data. Wrap the column in desc() to sort from highest to lowest instead.

# Sort the penguins by bill length, shortest first, and show only the first
# six rows of the sorted result.
arrange(data, bill_length_mm) %>%
  head(n = 6L)
## # A tibble: 6 × 8
##   species island bill_length_mm bill_depth_mm flipper_length_… body_mass_g sex  
##   <fct>   <fct>           <dbl>         <dbl>            <int>       <int> <fct>
## 1 Adelie  Dream            32.1          15.5              188        3050 fema…
## 2 Adelie  Dream            33.1          16.1              178        2900 fema…
## 3 Adelie  Torge…           33.5          19                190        3600 fema…
## 4 Adelie  Dream            34            17.1              185        3400 fema…
## 5 Adelie  Torge…           34.1          18.1              193        3475 <NA> 
## 6 Adelie  Torge…           34.4          18.4              184        3325 fema…
## # … with 1 more variable: year <int>
# Draw a random sample of 12 penguins to use in the next examples.
# Fixing the RNG seed first makes the same 12 rows come back on every run.
set.seed(406)
penguins_subset <- sample_n(data, size = 12)  # sample_n() is another dplyr function

# Print the sampled rows.
penguins_subset
## # A tibble: 12 × 8
##    species   island    bill_length_mm bill_depth_mm flipper_length_… body_mass_g
##    <fct>     <fct>              <dbl>         <dbl>            <int>       <int>
##  1 Adelie    Torgersen           41.4          18.5              202        3875
##  2 Gentoo    Biscoe              45.5          13.9              210        4200
##  3 Gentoo    Biscoe              43.5          15.2              213        4650
##  4 Gentoo    Biscoe              50.5          15.9              225        5400
##  5 Gentoo    Biscoe              45.8          14.2              219        4700
##  6 Chinstrap Dream               49.3          19.9              203        4050
##  7 Adelie    Biscoe              40.5          17.9              187        3200
##  8 Chinstrap Dream               45.2          16.6              191        3250
##  9 Adelie    Dream               36.3          19.5              190        3800
## 10 Adelie    Torgersen           39            17.1              191        3050
## 11 Adelie    Biscoe              41.6          18                192        3950
## 12 Gentoo    Biscoe              48.2          15.6              221        5100
## # … with 2 more variables: sex <fct>, year <int>
# Add a column holding each penguin's body mass converted from grams to
# pounds (1 lb = 453.59237 g). The existing columns are left untouched.
mutate(
  penguins_subset,
  body_weight_pounds = body_mass_g / 453.59237
)
## # A tibble: 12 × 9
##    species   island    bill_length_mm bill_depth_mm flipper_length_… body_mass_g
##    <fct>     <fct>              <dbl>         <dbl>            <int>       <int>
##  1 Adelie    Torgersen           41.4          18.5              202        3875
##  2 Gentoo    Biscoe              45.5          13.9              210        4200
##  3 Gentoo    Biscoe              43.5          15.2              213        4650
##  4 Gentoo    Biscoe              50.5          15.9              225        5400
##  5 Gentoo    Biscoe              45.8          14.2              219        4700
##  6 Chinstrap Dream               49.3          19.9              203        4050
##  7 Adelie    Biscoe              40.5          17.9              187        3200
##  8 Chinstrap Dream               45.2          16.6              191        3250
##  9 Adelie    Dream               36.3          19.5              190        3800
## 10 Adelie    Torgersen           39            17.1              191        3050
## 11 Adelie    Biscoe              41.6          18                192        3950
## 12 Gentoo    Biscoe              48.2          15.6              221        5100
## # … with 3 more variables: sex <fct>, year <int>, body_weight_pounds <dbl>
# Collapse the 12 sampled penguins to one row: their average body mass (g).
# None of the sampled rows has a missing body_mass_g, so mean() needs no NA
# handling here.
summarise(penguins_subset, avg_body_mass = mean(body_mass_g))
## # A tibble: 1 × 1
##   avg_body_mass
##           <dbl>
## 1         4102.
# Average body mass (g) over the entire penguins dataset. The full data has
# rows with a missing body_mass_g, so na.rm = TRUE drops them before averaging;
# without it the mean would be NA.
summarise(
  penguins,
  avg_body_mass = mean(body_mass_g, na.rm = TRUE)
)
## # A tibble: 1 × 1
##   avg_body_mass
##           <dbl>
## 1         4202.
# Average body mass (g) per species: group_by() splits the rows by species,
# and summarise() then collapses each group to a single row.
group_by(data, species) %>%
  summarise(
    avg_species_body_mass = mean(body_mass_g, na.rm = TRUE)
  )
## # A tibble: 3 × 2
##   species   avg_species_body_mass
##   <fct>                     <dbl>
## 1 Adelie                    3701.
## 2 Chinstrap                 3733.
## 3 Gentoo                    5076.
# now let's calculate the average body mass by species AND island
#
# With more than one grouping variable, summarise() by default drops only the
# last one (island) and returns a tibble that is still grouped by species,
# printing a message about it. `.groups = "drop"` returns a plain, ungrouped
# tibble instead, so later steps are not silently computed per species.

data %>%
  group_by(species, island) %>%
  summarise(
    avg_species_body_mass = mean(body_mass_g, na.rm = TRUE),
    .groups = "drop"
  )
## # A tibble: 5 × 3
##   species   island    avg_species_body_mass
##   <fct>     <fct>                     <dbl>
## 1 Adelie    Biscoe                    3710.
## 2 Adelie    Dream                     3688.
## 3 Adelie    Torgersen                 3706.
## 4 Chinstrap Dream                     3733.
## 5 Gentoo    Biscoe                    5076.