Part 1: The tidyverse package.

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

read.csv() - load a CSV (comma-separated values) file. Note: this is a base R function; the tidyverse equivalent is readr::read_csv().

filter() - keep rows that satisfy your condition.
select() - keep or exclude some columns.
rename() - rename columns.
relocate() - move columns.
mutate() - add a new column as a result of data manipulation.
group_by() + summarize() - get summary statistics by group.
count() - quickly find counts for different groups.
case_when() - friendly ifelse()
ifelse() - a condition clause

Part 2: The data set.

Load csv data set

# Option 1: read the CSV straight from its web address.
# Nothing is assigned here, so the data frame is only printed.
read.csv(
  file = "https://raw.githubusercontent.com/jossalene/CodingCult/master/penguins_size%20(2).csv"
)

# Option 2: read the file by name from the working directory
# (for example after adding it through the Files tab).
read.csv(file = "penguins_size (2).csv")

Name your CSV data set for convenient use.

# Store the data in an object called `penguins` so later steps can reuse it.
penguins <- read.csv(
  file = "https://raw.githubusercontent.com/jossalene/CodingCult/master/penguins_size%20(2).csv"
)

head(penguins) # first rows of the data
dim(penguins)  # number of rows and columns
str(penguins)  # column types with a preview of the values

Explore the variables

Character/string variables

# Distinct values found in each text column.
unique(penguins[["species"]])
unique(penguins[["island"]])
unique(penguins[["sex"]]) # the result includes NA and "."

Numeric/integer variables

# Summaries of body mass in grams; na.rm = TRUE skips the missing values.
min(penguins[["body_mass_g"]], na.rm = TRUE)
max(penguins[["body_mass_g"]], na.rm = TRUE)
mean(penguins[["body_mass_g"]], na.rm = TRUE)
# Same mean, rounded to one decimal place.
round(mean(penguins[["body_mass_g"]], na.rm = TRUE), digits = 1)
# Minimum, quartiles, mean, maximum and the NA count in one call.
summary(penguins[["body_mass_g"]])

The pipe operator %>% (percent sign, greater-than sign, percent sign) means AND THEN: it passes the result on its left to the function on its right.

# First four rows of the penguins data.
penguins %>%
  head(n = 4)

# Structure of the data frame.
penguins %>%
  str()

# Column names: the piped form and the direct call give the same result.
penguins %>%
  colnames()
colnames(penguins)

Rename variable

Syntax: rename(new_name = old_name)
# Shorten the column names (new name on the left, old name on the right)
# and save the result back into `penguins`.
penguins <- penguins %>%
  rename(
    Species = species,
    Island = island,
    beak.len = culmen_length_mm,
    beak.dep = culmen_depth_mm,
    flipper = flipper_length_mm,
    mass = body_mass_g
  )

Factoring

# Turn the Species and Island columns into factors, applying factor()
# to both columns in a single across() call.
penguins <- penguins %>%
  mutate(across(c(Species, Island), factor))

Counting

# Number of penguins per species; count() stores the tallies in column `n`.
penguins_count <- penguins %>%
  count(Species)
penguins_count[["n"]]

# Number of penguins for every Species / Island / sex combination.
penguins %>%
  count(Species, Island, sex)

Filtering

Condition symbols:

== : is / Match with
!= : is NOT / NOT Match with
> : greater than
< : less than
>= : greater than or equal to
<= : less than or equal to
& : AND - BOTH conditions must be true
| : OR - AT LEAST ONE of the conditions must be true

# Only the Chinstrap penguins.
penguins %>%
  filter(Species == "Chinstrap")

# Either Chinstrap or Gentoo.
penguins %>%
  filter(Species == "Chinstrap" | Species == "Gentoo")

# Every species other than Adelie.
penguins %>%
  filter(Species != "Adelie")

# Gentoo penguins on Dream island.
penguins %>%
  filter(Species == "Gentoo" & Island == "Dream")

# Adelie penguins on Dream island.
penguins %>%
  filter(Species == "Adelie" & Island == "Dream")

# Penguins whose flipper length is above 229 mm.
penguins %>%
  filter(flipper > 229)

select() - pick which column to keep or exclude.

# Keep only the Species and mass columns.
penguins %>%
  select(Species, mass)

# Keep the range of columns from Species to beak.dep (the first 4 columns).
penguins %>%
  select(Species:beak.dep)

# Keep Species to beak.dep AND sex (the first 4 columns + the last column).
penguins %>%
  select(Species:beak.dep, sex)

# Drop flipper and mass; keep everything else.
penguins %>%
  select(-c(flipper, mass))

# Keep everything except Island.
penguins %>%
  select(-Island)

# EXCLUDE the columns from Species to beak.dep (the first 4 columns).
penguins %>%
  select(!(Species:beak.dep))

# EXCLUDE Species to beak.dep AND sex (the first 4 columns + the last column);
# the leading ! negates the whole selection inside c().
penguins %>%
  select(!c(Species:beak.dep, sex))

select() with helpers: starts_with() , ends_with(), contains()

# Keep every column whose name begins with "beak".
penguins %>% select(starts_with("beak"))

select() and filter()

# Island and mass of the Gentoo penguins: filter the rows, then pick columns.
penguins %>%
  filter(Species == "Gentoo") %>%
  select(Island, mass)

# Reverse order: pick the columns first. Species has to be kept here,
# because the filter that follows still needs it.
penguins %>%
  select(Species, Island, mass) %>%
  filter(Species == "Gentoo")

# Male penguins whose flippers are longer than 228 mm
# (conditions separated by a comma are combined with AND).
penguins %>%
  filter(sex == "MALE", flipper > 228)

# Same rows, showing only the beak columns (beak.len and beak.dep).
penguins %>%
  filter(sex == "MALE", flipper > 228) %>%
  select(contains("beak"))

relocate()

# Put sex in the first position (relocate() moves columns to the front by default).
penguins %>% relocate(sex)

# Put flipper immediately after Species.
penguins %>% relocate(flipper, .after = Species)

# Put mass immediately before beak.len.
penguins %>% relocate(mass, .before = beak.len)

mutate()

  mutate( C = A + B )
  
# Convert mass from grams to kilograms (1 kg = 1000 g), replacing the
# values of `mass` in the printed result.
penguins %>%
  mutate(mass = mass / 1000)

# Same conversion, stored in a new column `mass.kg` so `mass` is untouched.
penguins %>%
  mutate(mass.kg = mass / 1000)

# Several columns in one call: add the beak length-to-depth ratio
# (2 decimals) and divide flipper by 1000.
penguins %>%
  mutate(
    beak.ratio = round(beak.len / beak.dep, digits = 2),
    flipper = flipper / 1000
  )

group_by() and summarize() - find summary statistics for different groups and generate a nice table

# Mean and standard deviation of body mass for EACH species,
# rounded to whole numbers.
penguins %>%
  group_by(Species) %>%
  summarise(
    mass_mean = round(mean(mass, na.rm = TRUE)),
    mass_sd = round(sd(mass, na.rm = TRUE))
  )

# Mean and standard deviation of flipper length for female Adelie
# penguins, computed separately for each island (2 decimals).
penguins %>%
  filter(Species == "Adelie", sex == "FEMALE") %>%
  group_by(Island) %>%
  summarise(
    flipper_mean = round(mean(flipper, na.rm = TRUE), digits = 2),
    flipper_sd = round(sd(flipper, na.rm = TRUE), digits = 2)
  )

mutate() and case_when() - very similar to mutate() and ifelse()

# Add a new column "size" based on body mass:
#   large  : mass > 4500 g
#   medium : 3000 g < mass <= 4500 g
#   small  : mass <= 3000 g
# Rows with a missing mass match none of the conditions and get NA.

# case_when(): one `condition ~ value` pair per category.
penguins %>%
  mutate(
    size = case_when(
      mass > 4500 ~ "large",
      mass > 3000 & mass <= 4500 ~ "medium",
      mass <= 3000 ~ "small"
    )
  )

# ifelse(): the same classification written as nested conditions.
penguins %>%
  mutate(
    size = ifelse(
      mass > 4500,
      "large",
      ifelse(mass > 3000 & mass <= 4500, "medium", "small")
    )
  )

Visualization

To generate plots, we will employ ggplot2 package which is already stored within the superpackage tidyverse.

ggplot(data = data_set_name, mapping = aes(x = , y = )) + geom_<graph_type>() + labs(x = "Horizontal Axis Label", y = "Vertical Axis Label", title = "Plot Title")

Bar Graph - Comparison between groups

1. Bar graph of Number of Penguins of Different Sex.

x = group variable = Sex; y = measured variable = Count

# One bar per value of `sex`; geom_bar() counts the rows for each value.
ggplot(penguins, aes(x = sex)) +
  geom_bar(colour = "#0077b6", fill = "#90e0ef") +
  labs(
    x = "Penguins Sex",
    y = "Count",
    title = "Number of Penguins of Different Sex"
  )

# Same bar graph built with the pipe: keep only the FEMALE / MALE rows
# (using | for OR), then draw narrower bars with the count printed on each.
penguins %>%
  filter(sex == "FEMALE" | sex == "MALE") %>%
  ggplot(mapping = aes(x = sex)) +
  geom_bar(
    color = "#0077b6",
    fill = "#90e0ef",
    width = 0.3
  ) +
  geom_text(
    stat = "count",
    # after_stat(count) replaces the ..count.. notation, which was
    # deprecated in ggplot2 3.4.0
    mapping = aes(label = after_stat(count)),
    # a large positive vjust pushes the label down into the bar
    vjust = 8
  ) +
  labs(
    x = "Penguins Sex",
    y = "Count",
    title = "Number of Penguins of Different Sex"
  )
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.

2. Bar graph of Different Penguins Species from Different Islands.

# Stacked bar graph: one bar per island, split by species, with each
# segment labelled by its own count.
penguins %>%
  ggplot(mapping = aes(x = Island, fill = Species)) +
  geom_bar(width = 0.3) +
  geom_text(
    stat = "count",
    # after_stat(count) replaces the deprecated ..count.. notation
    mapping = aes(label = after_stat(count)),
    # stack the labels the same way as the bars so that each number sits
    # in the middle of the segment it describes
    position = position_stack(vjust = 0.5)
  ) +
  labs(
    x = "Island", # the x axis shows islands; species is the fill colour
    y = "Count",
    title = "Bar graph of Different Penguins Species from Different Islands"
  ) +
  theme_classic()

Histogram - emphasizing the spread/variability of a numeric variable

1. Histogram of Penguins Body Mass

# Distribution of body mass across all penguins (default number of bins).
penguins %>%
  ggplot(mapping = aes(x = mass)) +
  geom_histogram(colour = "#f18701", fill = "#74c69d") +
  labs(
    x = "Body Mass (g)",
    y = "Number of Penguins",
    title = "Histogram of Penguins' Body Mass"
  ) +
  theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

2. Histogram of Penguins Flipper Length from Different Penguins Species

# Overlaid flipper-length histograms, one per species.
# position = "identity" draws the histograms on top of each other instead
# of stacking them; alpha = 0.5 keeps the overlapping bars visible.
penguins %>%
  ggplot(mapping = aes(x = flipper, fill = Species)) +
  geom_histogram(
    position = "identity",
    alpha = 0.5,
    colour = "black"
  ) +
  scale_fill_manual(values = c("darkorange", "purple", "cyan4")) +
  labs(
    x = "Flipper Length (mm)",
    y = "Number of Penguins",
    title = "Histogram of Penguins Flipper Length from Different Penguins Species"
  ) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).

Scatterplots - the relationship/correlation between 2 numeric variables

The relationship between Penguins Beak Depth and Length.

# Beak depth against beak length, coloured by sex.
# Rows whose sex is missing (NA) or recorded as "." are dropped first.
penguins %>%
  filter(!is.na(sex), sex != ".") %>%
  ggplot(mapping = aes(x = beak.len, y = beak.dep)) +
  geom_point(mapping = aes(colour = sex)) +
  theme_minimal() +
  labs(
    title = "The relationship between Penguins Beak Depth and Length",
    x = "Penguins' Beak Length (mm)",
    y = "Penguins' Beak Depth (mm)"
  )

# facet wrap: the same scatterplot split into one panel per sex.
# Rows whose sex is "." or NA are removed first. The colour legend is
# hidden because the panel titles already identify each sex.
penguins %>%
  filter(!(sex %in% c(".") | is.na(sex))) %>%
  ggplot(mapping = aes(x = beak.len, y = beak.dep)) +
  # spell out FALSE: the shorthand F is an ordinary variable that can be
  # reassigned
  geom_point(aes(color = sex), show.legend = FALSE) +
  theme_minimal() +
  labs(
    title = "The relationship between Penguins Beak Depth and Length",
    x = "Penguins' Beak Length (mm)",
    y = "Penguins' Beak Depth (mm)"
  ) +
  facet_wrap(~sex)

# facet grid: one column of panels per species, points coloured by sex.
# Rows whose sex is missing (NA) or recorded as "." are dropped first.
penguins %>%
  filter(!is.na(sex), sex != ".") %>%
  ggplot(mapping = aes(x = beak.len, y = beak.dep)) +
  geom_point(mapping = aes(colour = sex)) +
  theme_minimal() +
  labs(
    title = "The relationship between Penguins Beak Depth and Length",
    x = "Penguins' Beak Length (mm)",
    y = "Penguins' Beak Depth (mm)"
  ) +
  facet_grid(cols = vars(Species))

Boxplot - emphasizing the spread/variability of a numeric variable AND specify the quartiles

Boxplot of Flipper Length for Different Species

# Boxplot of flipper length per species with the individual penguins
# overlaid as jittered, semi-transparent points.
penguins %>%
  ggplot(mapping = aes(x = Species, y = flipper)) +
  geom_boxplot(
    aes(color = Species),
    width = 0.3,
    # the jitter layer below already draws every observation, so hide the
    # boxplot's own outlier points to avoid plotting them twice
    outlier.shape = NA,
    show.legend = FALSE
  ) +
  # x and y are inherited from ggplot(); only the colour mapping is added
  geom_jitter(
    aes(color = Species),
    alpha = 0.5,
    position = position_jitter(width = 0.1),
    show.legend = FALSE
  ) +
  scale_color_manual(values = c("darkorange", "purple", "cyan4")) +
  labs(
    x = "Penguins Species",
    y = "Flipper Length (mm)",
    title = "Boxplot of Flipper Length for Different Species"
  ) +
  theme_classic()
## Warning: Removed 2 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).