library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(palmerpenguins)
##
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
##
## penguins, penguins_raw
# A negative `warn` value makes R ignore every warning from this point on.
# NOTE(review): this may also hide deprecation warnings raised by the code
# below; confirm that blanket suppression is really intended.
options(warn = -1)
The Palmer Penguins dataset provides size measurements for three penguin species observed on three islands in the Palmer Archipelago, Antarctica. It includes the following variables:
species: Penguin species (Adelie, Chinstrap, Gentoo); island: Island where the penguin was observed (Biscoe, Dream, Torgersen); bill_length_mm: Bill length (mm); bill_depth_mm: Bill depth (mm); flipper_length_mm: Flipper length (mm); body_mass_g: Body mass (g); sex: Sex (male, female); year: Year of observation.
# Print the raw data set before any cleaning is applied.
print(penguins)
## # A tibble: 344 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # ℹ 334 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
# Count the NA entries in every column of the raw data.
missing_value_per_col <- penguins %>%
  is.na() %>%
  colSums()
print(missing_value_per_col)
## species island bill_length_mm bill_depth_mm
## 0 0 2 2
## flipper_length_mm body_mass_g sex year
## 2 2 11 0
# Keep only the rows that have no missing value in any column.
penguins_clean <- penguins %>% na.omit()
# Number of rows per species in the raw (uncleaned) data.
count(penguins, species)
## # A tibble: 3 × 2
## species n
## <fct> <int>
## 1 Adelie 152
## 2 Chinstrap 68
## 3 Gentoo 124
# Bar chart of the number of rows per island in the raw data, with the count
# printed just above each bar.
ggplot(penguins, aes(x = island)) +
  geom_bar(fill = "blue") +
  # after_stat(count) is the supported replacement for the deprecated
  # `..count..` notation and yields the same computed count.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "Island Distribution")
# Histogram of bill length, split into 50 bins.
ggplot(data = penguins_clean, mapping = aes(x = bill_length_mm)) +
  geom_histogram(bins = 50)

# Flipper length per species; outliers are drawn in red with point shape 1.
ggplot(
  data = penguins_clean,
  mapping = aes(x = species, y = flipper_length_mm)
) +
  geom_boxplot(colour = "#3366FF", outlier.colour = "red", outlier.shape = 1)
# Bar chart of the number of cleaned rows per sex, with the count printed
# just above each bar.
ggplot(penguins_clean, aes(x = sex)) +
  geom_bar(stat = "count", fill = "skyblue") +
  # after_stat(count) is the supported replacement for the deprecated
  # `..count..` notation and yields the same computed count.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "Sex distribution of Penguins")
# Density curve of body mass, filled in cyan.
ggplot(data = penguins_clean, mapping = aes(x = body_mass_g)) +
  geom_density(fill = "cyan")

# Scatter plot of bill depth against bill length.
ggplot(
  data = penguins_clean,
  mapping = aes(x = bill_length_mm, y = bill_depth_mm)
) +
  geom_point()

## Color by Species: the same scatter plot, with both point shape and colour
## mapped to species.
ggplot(
  data = penguins_clean,
  mapping = aes(
    x = bill_length_mm,
    y = bill_depth_mm,
    shape = species,
    color = species
  )
) +
  geom_point()
# Bar chart of the number of rows per year in the raw data, with the count
# printed just above each bar.
ggplot(penguins, aes(x = year)) +
  geom_bar(stat = "count", fill = "blue") +
  # after_stat(count) is the supported replacement for the deprecated
  # `..count..` notation and yields the same computed count.
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "No. of observations per year.")
# Count the rows of `penguins_clean` for one sex within one species.
# Returns the one-column (`n`) tibble produced by count(), exactly as the
# six hand-written pipelines this helper replaces did.
count_sex_in_species <- function(sex_value, species_value) {
  penguins_clean %>%
    filter(sex %in% sex_value & species %in% species_value) %>%
    count()
}

# Sex ratio (males per female) for Adelie.
male_Adelie <- count_sex_in_species("male", "Adelie")
female_Adelie <- count_sex_in_species("female", "Adelie")
sex_ratio_Adelie <- male_Adelie / female_Adelie

# Sex ratio (males per female) for Chinstrap.
male_Chinstrap <- count_sex_in_species("male", "Chinstrap")
female_Chinstrap <- count_sex_in_species("female", "Chinstrap")
sex_ratio_Chinstrap <- male_Chinstrap / female_Chinstrap

# Sex ratio (males per female) for Gentoo.
male_Gentoo <- count_sex_in_species("male", "Gentoo")
female_Gentoo <- count_sex_in_species("female", "Gentoo")
sex_ratio_Gentoo <- male_Gentoo / female_Gentoo

# Collect the three ratios into one data frame for plotting. as.numeric()
# unwraps each single-cell data frame produced by the divisions above.
sex_ratio <- data.frame(
  species = c("Adelie", "Chinstrap", "Gentoo"),
  sex_ratio = c(
    as.numeric(sex_ratio_Adelie),
    as.numeric(sex_ratio_Chinstrap),
    as.numeric(sex_ratio_Gentoo)
  )
)
# Bar chart of the male/female ratio per species, labelled above each bar.
ggplot(sex_ratio, aes(x = species, y = sex_ratio)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col() +
  # Round the label: the raw ratio prints with many decimals and clutters
  # the chart. Bar heights still use the unrounded value.
  geom_text(aes(label = round(sex_ratio, 2)), vjust = -0.5) +
  labs(title = "Sex Ratio of each Species")
library(tidyr)
# Same male/female ratio, computed in one pipeline: count each
# species/sex pair, spread the sexes into columns, then divide.
sex_ratio1 <- penguins_clean %>%
  count(species, sex) %>%
  pivot_wider(names_from = sex, values_from = n) %>%
  mutate(sex_ratio1 = male / female) %>%
  select(species, sex_ratio1)
print(sex_ratio1)
## # A tibble: 3 × 2
## species sex_ratio1
## <fct> <dbl>
## 1 Adelie 1
## 2 Chinstrap 1
## 3 Gentoo 1.05
# Bar chart of the male/female ratio per species, labelled above each bar.
ggplot(sex_ratio1, aes(x = species, y = sex_ratio1)) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col(fill = "blue") +
  # Round the label: the raw ratio prints with many decimals and clutters
  # the chart. Bar heights still use the unrounded value.
  geom_text(aes(label = round(sex_ratio1, 2)), vjust = -0.5) +
  labs(title = "Sex Ratio of each Species")
# Flipper length against bill length, with a straight-line (lm) fit drawn
# without its confidence band.
ggplot(
  data = penguins_clean,
  mapping = aes(x = bill_length_mm, y = flipper_length_mm)
) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'
# Body mass per sex; outliers are drawn in red with point shape 1.
ggplot(data = penguins_clean, mapping = aes(x = sex, y = body_mass_g)) +
  geom_boxplot(colour = "#3366FF", outlier.colour = "red", outlier.shape = 1)
# Number of cleaned rows for every species/island pair that occurs.
heatmap_data <- count(penguins_clean, species, island)

# Tile plot of those counts: one tile per pair, shaded by count and
# outlined in red.
ggplot(
  data = heatmap_data,
  mapping = aes(x = island, y = species, fill = n)
) +
  geom_tile(color = "red") +
  scale_fill_gradient(low = "lightblue", high = "steelblue") +
  labs(
    title = "Island-Species Heatmap",
    x = "Island",
    y = "Species",
    fill = "Count"
  )
library(ggcorrplot)
# Pairwise correlations between all numeric columns, rounded to one decimal
# place and drawn as a grid of squares.
corr_mat <- penguins_clean %>%
  select(where(is.numeric)) %>%
  cor() %>%
  round(1)
ggcorrplot(corr_mat, method = "square")
# Rows whose flipper length exceeds 200.
# Base R equivalent: penguins_clean[penguins_clean$flipper_length_mm > 200, ]
# dplyr version:
filter(penguins_clean, flipper_length_mm > 200)
## # A tibble: 144 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Dream 35.7 18 202 3550
## 2 Adelie Dream 41.1 18.1 205 4300
## 3 Adelie Dream 40.8 18.9 208 4300
## 4 Adelie Biscoe 41 20 203 4725
## 5 Adelie Torgersen 41.4 18.5 202 3875
## 6 Adelie Torgersen 44.1 18 210 4000
## 7 Adelie Dream 41.5 18.5 201 4000
## 8 Gentoo Biscoe 46.1 13.2 211 4500
## 9 Gentoo Biscoe 50 16.3 230 5700
## 10 Gentoo Biscoe 48.7 14.1 210 4450
## # ℹ 134 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
# Mean and standard deviation of bill length for each species.
# Plain summarise() replaces the superseded summarise_at(); the output
# columns (`Mean`, `Std`) are unchanged.
penguins_clean %>%
  group_by(species) %>%
  summarise(
    Mean = mean(bill_length_mm),
    Std = sd(bill_length_mm)
  )
## # A tibble: 3 × 3
## species Mean Std
## <fct> <dbl> <dbl>
## 1 Adelie 38.8 2.66
## 2 Chinstrap 48.8 3.34
## 3 Gentoo 47.6 3.11
# Heaviest penguins within each species, heaviest first.
# slice_max() keeps ties by default, so a species can contribute more than
# five rows when several penguins share the fifth-largest mass.
penguins_clean %>%
  group_by(species) %>%
  slice_max(order_by = body_mass_g, n = 5) %>%
  arrange(species, desc(body_mass_g)) %>%
  select(species, body_mass_g)
## # A tibble: 16 × 2
## # Groups: species [3]
## species body_mass_g
## <fct> <int>
## 1 Adelie 4775
## 2 Adelie 4725
## 3 Adelie 4700
## 4 Adelie 4675
## 5 Adelie 4650
## 6 Chinstrap 4800
## 7 Chinstrap 4550
## 8 Chinstrap 4500
## 9 Chinstrap 4450
## 10 Chinstrap 4400
## 11 Gentoo 6300
## 12 Gentoo 6050
## 13 Gentoo 6000
## 14 Gentoo 6000
## 15 Gentoo 5950
## 16 Gentoo 5950
# Mean flipper length per year.
# Plain summarise() replaces the superseded summarise_at(); the output
# column name is unchanged.
avg_flipper <- penguins_clean %>%
  group_by(year) %>%
  summarise(Average_flipper_length = mean(flipper_length_mm))

# Line chart of the yearly mean, with a point marking each year.
ggplot(avg_flipper, aes(x = year, y = Average_flipper_length)) +
  # Line thickness is set with `linewidth`; `size` is deprecated for lines.
  geom_line(color = "darkblue", linewidth = 1.2) +
  geom_point(color = "black", size = 3)
# Mean body mass per island.
# Plain summarise() replaces the superseded summarise_at(); the output
# column name is unchanged.
penguins_clean %>%
  group_by(island) %>%
  summarise(Average_body_mass = mean(body_mass_g))
## # A tibble: 3 × 2
## island Average_body_mass
## <fct> <dbl>
## 1 Biscoe 4719.
## 2 Dream 3719.
## 3 Torgersen 3709.
# Percentage of missing values in each column of the raw data.
100 * (colSums(is.na(penguins)) / nrow(penguins))
## species island bill_length_mm bill_depth_mm
## 0.0000000 0.0000000 0.5813953 0.5813953
## flipper_length_mm body_mass_g sex year
## 0.5813953 0.5813953 3.1976744 0.0000000
# Rows whose bill depth is above the mean bill depth of their own species.
# The species mean is kept in `Avg_mean` so it shows next to each value.
penguins_clean %>%
  group_by(species) %>%
  mutate(Avg_mean = mean(bill_depth_mm)) %>%
  filter(bill_depth_mm > Avg_mean) %>%
  select(species, bill_depth_mm, Avg_mean)
## # A tibble: 171 × 3
## # Groups: species [3]
## species bill_depth_mm Avg_mean
## <fct> <dbl> <dbl>
## 1 Adelie 18.7 18.3
## 2 Adelie 19.3 18.3
## 3 Adelie 20.6 18.3
## 4 Adelie 19.6 18.3
## 5 Adelie 21.2 18.3
## 6 Adelie 21.1 18.3
## 7 Adelie 19 18.3
## 8 Adelie 20.7 18.3
## 9 Adelie 18.4 18.3
## 10 Adelie 21.5 18.3
## # ℹ 161 more rows
# Label every penguin "Above" or "Below" according to whether its body mass
# exceeds the mean body mass of its species.
penguins_clean %>%
  group_by(species) %>%
  mutate(
    Avg_body_mass = mean(body_mass_g),
    body_mass_level = if_else(
      condition = body_mass_g > Avg_body_mass,
      true = "Above",
      false = "Below"
    )
  ) %>%
  select(species, body_mass_g, Avg_body_mass, body_mass_level)
## # A tibble: 333 × 4
## # Groups: species [3]
## species body_mass_g Avg_body_mass body_mass_level
## <fct> <int> <dbl> <chr>
## 1 Adelie 3750 3706. Above
## 2 Adelie 3800 3706. Above
## 3 Adelie 3250 3706. Below
## 4 Adelie 3450 3706. Below
## 5 Adelie 3650 3706. Below
## 6 Adelie 3625 3706. Below
## 7 Adelie 4675 3706. Above
## 8 Adelie 3200 3706. Below
## 9 Adelie 3800 3706. Above
## 10 Adelie 4400 3706. Above
## # ℹ 323 more rows
# Rank flipper length within each species, longest first. dense_rank()
# gives tied values the same rank and leaves no gaps in the numbering.
penguins_clean %>%
  group_by(species) %>%
  mutate(Rank = dense_rank(desc(flipper_length_mm))) %>%
  select(species, flipper_length_mm, Rank) %>%
  arrange(species, Rank)
## # A tibble: 333 × 3
## # Groups: species [3]
## species flipper_length_mm Rank
## <fct> <int> <int>
## 1 Adelie 210 1
## 2 Adelie 208 2
## 3 Adelie 205 3
## 4 Adelie 203 4
## 5 Adelie 202 5
## 6 Adelie 202 5
## 7 Adelie 201 6
## 8 Adelie 200 7
## 9 Adelie 200 7
## 10 Adelie 199 8
## # ℹ 323 more rows
# Share of each species in the cleaned data, as a percentage rounded to
# two decimals.
penguins_clean %>%
  count(species) %>%
  mutate(
    Proportion = round(n / nrow(penguins_clean) * 100, digits = 2)
  ) %>%
  select(species, Proportion)
## # A tibble: 3 × 2
## species Proportion
## <fct> <dbl>
## 1 Adelie 43.8
## 2 Chinstrap 20.4
## 3 Gentoo 35.7
library(tidyr)
# Mean body mass for every species/sex pair, reshaped so that each sex
# becomes its own column prefixed with "Avg_body_mass_".
penguins_clean %>%
  group_by(species, sex) %>%
  summarise(Avg_body_mass = mean(body_mass_g), .groups = "drop") %>%
  pivot_wider(
    names_from = sex,
    values_from = Avg_body_mass,
    names_prefix = "Avg_body_mass_"
  )
## # A tibble: 3 × 3
## species Avg_body_mass_female Avg_body_mass_male
## <fct> <dbl> <dbl>
## 1 Adelie 3369. 4043.
## 2 Chinstrap 3527. 3939.
## 3 Gentoo 4680. 5485.
# `.groups = "drop"` makes summarise() return an ungrouped result, so the output does not stay grouped by species.
# Bill-length histograms (20 bins) filled by sex, one panel per species,
# stacked in three rows.
ggplot(
  data = penguins_clean,
  mapping = aes(x = bill_length_mm, fill = sex)
) +
  geom_histogram(bins = 20) +
  facet_wrap(vars(species), nrow = 3)
# Flipper length per species as a boxplot. geom_jitter() overlays the
# individual observations on top of the boxplot summary.
ggplot(
  data = penguins_clean,
  mapping = aes(x = species, y = flipper_length_mm)
) +
  geom_boxplot(colour = "#3366FF", outlier.colour = "red", outlier.shape = 1) +
  geom_jitter(width = 0.2, alpha = 0.5, color = "darkblue")
# Overlaid body-mass density curves, one semi-transparent curve per species.
ggplot(data = penguins_clean, mapping = aes(x = body_mass_g, fill = species)) +
  geom_density(alpha = 0.5)

# Bill-depth density curves in a grid: species as rows, islands as columns.
ggplot(data = penguins_clean, mapping = aes(x = bill_depth_mm, fill = species)) +
  geom_density() +
  facet_grid(rows = vars(species), cols = vars(island))
# Stacked bar chart of species counts split by sex, with each segment's
# count printed at its vertical centre.
ggplot(penguins_clean, aes(x = species, fill = sex)) +
  geom_bar() +
  scale_fill_manual(values = c(male = "blue", female = "skyblue")) +
  # after_stat(count) is the supported replacement for the deprecated
  # `..count..` notation and yields the same computed count.
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_stack(vjust = 0.5)
  )
# Stacked bar chart of species counts split by island, with each segment's
# count printed at its vertical centre.
ggplot(penguins_clean, aes(x = species, fill = island)) +
  geom_bar() +
  scale_fill_manual(
    values = c(
      Biscoe = "blue",
      Dream = "skyblue",
      Torgersen = "navyblue"
    )
  ) +
  # after_stat(count) is the supported replacement for the deprecated
  # `..count..` notation and yields the same computed count.
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_stack(vjust = 0.5)
  ) +
  theme_minimal()
# Mean body mass for every species/island/year combination, ungrouped.
subgroup <- penguins_clean %>%
  group_by(species, island, year) %>%
  summarise(Avg_body_mass = mean(body_mass_g), .groups = "drop")

# Subgroups treated as "under risk": mean body mass below 3650.
under_risk <- subgroup %>% filter(Avg_body_mass < 3650)
# Yearly mean body mass, one panel per island and one coloured line per
# species. Subgroups listed in `under_risk` are marked with black points.
ggplot(data = subgroup, mapping = aes(x = year, y = Avg_body_mass)) +
  # interaction(species, island) gives every species-island combination
  # its own line.
  geom_line(
    mapping = aes(group = interaction(species, island), colour = species),
    linewidth = 1.2
  ) +
  geom_point(
    data = under_risk,
    mapping = aes(x = year, y = Avg_body_mass),
    color = "black",
    size = 3
  ) +
  facet_wrap(vars(island)) +
  labs(
    title = "Average Body Mass of Palmer Penguins Over Years by Species and Island",
    subtitle = "Black points indicate under-risk subgroups (mean < 3650g)",
    x = "Year",
    y = "Average Body Mass (g)"
  )