tidyverse package.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
read.csv() - load a csv file (aka comma-separated-values file format)
filter() - keep rows that satisfy your condition.
select() - keep or exclude some columns.
rename() - rename columns.
relocate() - move columns.
mutate() - add a new column as a result of data manipulation.
group_by() + summarize() - get summary statistics by group.
count() - quickly find counts for different groups.
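case_when() - a friendlier alternative to nested ifelse() when there are several conditions.
ifelse() - a vectorized if/else: returns one value where the condition is TRUE and another where it is FALSE.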
# Option 1: read the CSV straight from its raw GitHub URL.
# The result is not assigned, so it is simply printed.
read.csv(file = "https://raw.githubusercontent.com/jossalene/CodingCult/master/penguins_size%20(2).csv")

# Option 2: read the local copy (e.g. the one shown in the Files tab).
# The path is relative to the current working directory.
read.csv(file = "penguins_size (2).csv")

# Keep the data in `penguins` for the rest of the analysis.
penguins <- read.csv(
  file = "https://raw.githubusercontent.com/jossalene/CodingCult/master/penguins_size%20(2).csv"
)

# First rows, dimensions (rows, columns) and column types.
head(x = penguins)
dim(x = penguins)
str(object = penguins)
Character/string variables
# Distinct values found in each text column.
unique(penguins[["species"]])
unique(penguins[["island"]])
unique(penguins[["sex"]]) # some entries are NA or "."
Numeric/integer variables
# Smallest, largest and average body mass; na.rm = TRUE skips missing values.
min(penguins[["body_mass_g"]], na.rm = TRUE)
max(penguins[["body_mass_g"]], na.rm = TRUE)
mean(penguins[["body_mass_g"]], na.rm = TRUE)
# The same mean, rounded to one decimal place.
round(x = mean(penguins[["body_mass_g"]], na.rm = TRUE), digits = 1)
# Extremes, quartiles and mean in a single call.
summary(penguins[["body_mass_g"]])
# Show only the first four rows.
penguins %>%
  head(n = 4)

# Show the structure (column types and a preview of the values).
penguins %>%
  str()

# Column names, written with the pipe ...
penguins %>%
  colnames()
# ... and as a plain base R function call.
colnames(penguins)
rename( new-name = old-name )
# Shorten the column names (rename() takes new_name = old_name), then
# store Species and Island as factors. The result overwrites `penguins`.
penguins <- penguins %>%
  rename(
    Species = species,
    Island = island,
    beak.len = culmen_length_mm,
    beak.dep = culmen_depth_mm,
    flipper = flipper_length_mm,
    mass = body_mass_g
  ) %>%
  mutate(
    Species = factor(Species),
    Island = factor(Island)
  )
# Number of penguins per species, saved so the counts can be reused.
penguins_count <- penguins %>%
  count(Species)
# The counts alone, as a vector.
penguins_count[["n"]]

# Number of penguins for every Species / Island / sex combination.
penguins %>%
  count(Species, Island, sex)
Condition symbols:
== : is / matches
!= : is NOT / does NOT match
> : greater than
< : less than
>= : greater than or equal to
<= : less than or equal to
& : AND - BOTH conditions are needed
| : OR - EITHER ONE condition is needed
# Only the Chinstrap rows.
penguins %>%
  filter(Species == "Chinstrap")

# Rows that are Chinstrap OR Gentoo.
penguins %>%
  filter(Species == "Chinstrap" | Species == "Gentoo")

# Every row that is NOT Adelie.
# NOTE(review): presumably the same rows as the previous filter if the data
# holds only these three species - confirm with unique(penguins$Species).
penguins %>%
  filter(Species != "Adelie")

# Gentoo rows recorded on Dream island (both conditions must hold).
penguins %>%
  filter(Species == "Gentoo" & Island == "Dream")

# Adelie rows recorded on Dream island.
penguins %>%
  filter(Species == "Adelie" & Island == "Dream")

# Rows whose flipper length is greater than 229 mm.
penguins %>%
  filter(flipper > 229)
# keep only the columns Species and mass
penguins %>%
  select(Species, mass)

# keep the columns from Species to beak.dep (the first 4 columns)
penguins %>%
  select(Species:beak.dep)

# keep the columns from Species to beak.dep AND sex
# (the first 4 columns + the sex column)
penguins %>%
  select(Species:beak.dep, sex)

# keep all columns except flipper and mass
penguins %>%
  select(-c(flipper, mass))

# keep all columns except Island
penguins %>%
  select(-Island)

# EXCLUDE the columns from Species to beak.dep (the first 4 columns)
penguins %>%
  select(!(Species:beak.dep))

# EXCLUDE the columns from Species to beak.dep AND sex
# (the negation applies to everything inside c())
penguins %>%
  select(!c(Species:beak.dep, sex))

# keep the columns whose name starts with "beak"
penguins %>%
  select(starts_with("beak"))
# Island and mass of the Gentoo penguins: filter the rows first,
# then keep the two columns of interest.
penguins %>%
  filter(Species == "Gentoo") %>%
  select(Island, mass)

# Selecting first and filtering afterwards: Species has to be kept by
# select() so that filter() can still test it.
penguins %>%
  select(Species, Island, mass) %>%
  filter(Species == "Gentoo")

# Male penguins whose flippers are longer than 228 mm.
penguins %>%
  filter(sex == "MALE", flipper > 228)

# The same penguins, showing only the columns whose name contains "beak"
# (beak.len and beak.dep).
penguins %>%
  filter(sex == "MALE", flipper > 228) %>%
  select(contains("beak"))
# Put sex first (relocate() moves columns to the front by default).
penguins %>%
  relocate(sex)

# Put flipper right after Species.
penguins %>%
  relocate(flipper, .after = Species)

# Put mass right before beak.len.
penguins %>%
  relocate(mass, .before = beak.len)
mutate( C = A + B )
# Grams to kilograms (1 kg = 1000 g), overwriting the mass column in the
# printed result; `penguins` itself is not reassigned here.
penguins %>%
  mutate(mass = mass / 1000)

# Same conversion, stored in a new column so the original mass is kept.
penguins %>%
  mutate(mass.kg = mass / 1000)

# Several columns in one mutate(): a rounded beak length/depth ratio and
# flipper divided by 1000.
penguins %>%
  mutate(
    beak.ratio = round(beak.len / beak.dep, digits = 2),
    flipper = flipper / 1000
  )
# Mean and standard deviation of body mass for EACH species,
# rounded to whole numbers; missing values are ignored.
penguins %>%
  group_by(Species) %>%
  summarise(
    mass_mean = round(mean(mass, na.rm = TRUE), digits = 0),
    mass_sd = round(sd(mass, na.rm = TRUE), digits = 0)
  )

# Mean and standard deviation of flipper length for female Adelie
# penguins, computed separately for each island.
penguins %>%
  filter(Species == "Adelie", sex == "FEMALE") %>%
  group_by(Island) %>%
  summarise(
    flipper_mean = round(mean(flipper, na.rm = TRUE), digits = 2),
    flipper_sd = round(sd(flipper, na.rm = TRUE), digits = 2)
  )
# Add a "size" column derived from body mass (g):
#   large  = mass > 4500
#   medium = 3000 < mass <= 4500
#   small  = mass <= 3000
# Rows with a missing mass match no condition and get NA.
penguins %>%
  mutate(
    size = case_when(
      mass > 4500 ~ "large",
      mass > 3000 & mass <= 4500 ~ "medium",
      mass <= 3000 ~ "small"
    )
  )

# The same classification with nested ifelse(); a missing mass also yields NA.
penguins %>%
  mutate(
    size = ifelse(
      mass > 4500,
      "large",
      ifelse(mass > 3000 & mass <= 4500, "medium", "small")
    )
  )
To generate plots, we will employ ggplot2 package which is already stored within the superpackage tidyverse.
ggplot(data = data-set-name, mapping = aes(x = , y = )) + geom_graph-type() + labs(x = "Horizontal Line Label", y = "Vertical Line Label", title = "Plot Title")
###1. Bar graph of Number of Penguins in Different Sex.
x = Group variable = Sex; y = Measured variable = Count
# Bar chart of the number of rows per value of sex
# (geom_bar() counts the rows itself, so only x is mapped).
penguins %>%
  ggplot(mapping = aes(x = sex)) +
  geom_bar(
    color = "#0077b6", # bar outline
    fill = "#90e0ef"   # bar interior
  ) +
  labs(
    title = "Number of Penguins of Different Sex",
    x = "Penguins Sex",
    y = "Count"
  )
# Same chart restricted to the FEMALE / MALE rows, using the pipe %>% and | (or),
# with the count printed on each bar.
penguins %>%
  filter(sex == "FEMALE" | sex == "MALE") %>%
  ggplot(mapping = aes(x = sex)) +
  geom_bar(
    color = "#0077b6",
    fill = "#90e0ef",
    width = 0.3
  ) +
  # after_stat(count) replaces the `..count..` notation, which was
  # deprecated in ggplot2 3.4.0.
  geom_text(
    stat = "count",
    mapping = aes(label = after_stat(count)),
    vjust = 8
  ) +
  labs(
    x = "Penguins Sex",
    y = "Count",
    title = "Number of Penguins of Different Sex"
  )
###2. Bar graph of Different Penguins Species from Different Island.
# One bar per island, split (stacked) by species.
penguins %>%
  ggplot(mapping = aes(x = Island, fill = Species)) +
  geom_bar(width = 0.3) +
  # The bars are stacked, so the labels must be stacked too; vjust = 0.5
  # centres each count inside its own segment.
  geom_text(
    stat = "count",
    mapping = aes(label = after_stat(count)),
    position = position_stack(vjust = 0.5)
  ) +
  # The x axis shows islands (x = Island), so it is labelled accordingly.
  labs(
    x = "Island",
    y = "Count",
    title = "Bar graph of Different Penguins Species from Different Islands"
  ) +
  theme_classic()
# Histogram of body mass across all penguins (default binning).
penguins %>%
  ggplot(mapping = aes(x = mass)) +
  geom_histogram(
    fill = "#74c69d", # bar interior
    color = "#f18701" # bar outline
  ) +
  theme_bw() +
  labs(
    title = "Histogram of Penguins' Body Mass",
    x = "Body Mass (g)",
    y = "Number of Penguins"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).
# Overlaid histograms of flipper length, one per species.
# position = "identity" draws the species on top of each other instead of
# stacking them; alpha = 0.5 keeps the overlapping bars visible.
penguins %>%
  ggplot(mapping = aes(x = flipper, fill = Species)) +
  geom_histogram(
    position = "identity",
    alpha = 0.5,
    color = "black"
  ) +
  scale_fill_manual(values = c("darkorange", "purple", "cyan4")) +
  theme_minimal() +
  labs(
    title = "Histogram of Penguins Flipper Length from Different Penguins Species",
    x = "Flipper Length (mm)",
    y = "Number of Penguins"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing non-finite values (`stat_bin()`).
# Scatter plot of beak depth against beak length, coloured by sex.
# Rows whose sex is "." or missing are dropped first.
penguins %>%
  filter(!(sex %in% "."), !is.na(sex)) %>%
  ggplot(mapping = aes(x = beak.len, y = beak.dep)) +
  geom_point(mapping = aes(color = sex)) +
  labs(
    x = "Penguins' Beak Length (mm)",
    y = "Penguins' Beak Depth (mm)",
    title = "The relationship between Penguins Beak Depth and Length"
  ) +
  theme_minimal()
# facet wrap: one panel per sex.
# Rows whose sex is "." or missing are dropped first. The colour legend is
# hidden because the panel titles already identify each sex.
penguins %>%
  filter(!(sex %in% c(".") | is.na(sex))) %>%
  ggplot(mapping = aes(x = beak.len, y = beak.dep)) +
  # FALSE is spelled out: the shorthand F is an ordinary variable that can
  # be reassigned.
  geom_point(aes(color = sex), show.legend = FALSE) +
  theme_minimal() +
  labs(
    title = "The relationship between Penguins Beak Depth and Length",
    x = "Penguins' Beak Length (mm)",
    y = "Penguins' Beak Depth (mm)"
  ) +
  facet_wrap(~sex)
# facet grid: one column of panels per species, points coloured by sex.
# Rows whose sex is "." or missing are dropped first.
penguins %>%
  filter(!(sex %in% "."), !is.na(sex)) %>%
  ggplot(mapping = aes(x = beak.len, y = beak.dep)) +
  geom_point(mapping = aes(color = sex)) +
  labs(
    x = "Penguins' Beak Length (mm)",
    y = "Penguins' Beak Depth (mm)",
    title = "The relationship between Penguins Beak Depth and Length"
  ) +
  theme_minimal() +
  facet_grid(. ~ Species)
# Boxplot of flipper length per species with the individual penguins
# drawn on top as lightly jittered, semi-transparent points.
penguins %>%
  ggplot(mapping = aes(x = Species, y = flipper)) +
  # Legends are hidden (show.legend = FALSE) because the x axis already
  # names each species.
  geom_boxplot(
    aes(color = Species),
    width = 0.3,
    show.legend = FALSE
  ) +
  # x and y are inherited from ggplot(); only the colour is mapped here.
  geom_jitter(
    aes(color = Species),
    alpha = 0.5,
    position = position_jitter(width = 0.1),
    show.legend = FALSE
  ) +
  scale_color_manual(values = c("darkorange", "purple", "cyan4")) +
  labs(
    x = "Penguins Species",
    y = "Flipper Length (mm)",
    title = "Boxplot of Flipper Length for Different Species"
  ) +
  theme_classic()
## Warning: Removed 2 rows containing non-finite values (`stat_boxplot()`).
## Warning: Removed 2 rows containing missing values (`geom_point()`).