library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(palmerpenguins)
## 
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
## 
##     penguins, penguins_raw
# A negative `warn` makes R ignore every warning from this point on.
# NOTE(review): this is session-wide, so deprecation and data-quality warnings
# raised by later chunks are hidden too; confirm that is intended.
options(warn = -1)

Dataset Overview

The Palmer Penguins dataset provides size measurements for three penguin species observed on three islands in the Palmer Archipelago, Antarctica. It includes the following variables:

- species: Penguin species (Adelie, Chinstrap, Gentoo)
- island: Island where the penguin was observed (Biscoe, Dream, Torgersen)
- bill_length_mm: Bill length (mm)
- bill_depth_mm: Bill depth (mm)
- flipper_length_mm: Flipper length (mm)
- body_mass_g: Body mass (g)
- sex: Sex (male, female)
- year: Year of observation

Data Inspection: Load the dataset and display the first few rows.

# Show the tibble; its print method truncates to the first rows and columns.
penguins %>% print()
## # A tibble: 344 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           NA            NA                  NA          NA
##  5 Adelie  Torgersen           36.7          19.3               193        3450
##  6 Adelie  Torgersen           39.3          20.6               190        3650
##  7 Adelie  Torgersen           38.9          17.8               181        3625
##  8 Adelie  Torgersen           39.2          19.6               195        4675
##  9 Adelie  Torgersen           34.1          18.1               193        3475
## 10 Adelie  Torgersen           42            20.2               190        4250
## # ℹ 334 more rows
## # ℹ 2 more variables: sex <fct>, year <int>

Missing Values: Identify and count missing values in each column.

# Flag every NA cell, then total the flags column by column.
missing_value_per_col <- penguins %>%
  is.na() %>%
  colSums()
print(missing_value_per_col)
##           species            island    bill_length_mm     bill_depth_mm 
##                 0                 0                 2                 2 
## flipper_length_mm       body_mass_g               sex              year 
##                 2                 2                11                 0
# Keep only the rows that have no NA in any column; every later chunk that
# uses `penguins_clean` works on these complete cases.
penguins_clean <- penguins %>% na.omit()

Species Count: Count the number of penguins per species.

# Number of rows per species, taken from the raw data (NA rows included).
count(penguins, species)
## # A tibble: 3 × 2
##   species       n
##   <fct>     <int>
## 1 Adelie      152
## 2 Chinstrap    68
## 3 Gentoo      124

Island Distribution: Visualize the number of penguins on each island using a bar chart.

# Bar chart of the number of penguins recorded on each island, with the count
# printed just above every bar.
# `after_stat(count)` replaces the deprecated `..count..` notation.
ggplot(penguins, aes(x = island)) +
  geom_bar(fill = "blue") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "Island Distribution")

Bill Length Histogram: Plot a histogram of bill lengths.

# Distribution of bill length across all complete cases, split into 50 bins.
ggplot(data = penguins_clean, mapping = aes(x = bill_length_mm)) +
  geom_histogram(bins = 50)

Flipper Length Boxplot: Create a boxplot of flipper lengths by species.

# One box per species; outliers are drawn as hollow red circles.
ggplot(data = penguins_clean, mapping = aes(x = species, y = flipper_length_mm)) +
  geom_boxplot(
    colour = "#3366FF",
    outlier.colour = "red",
    outlier.shape = 1
  )

Sex Distribution: Generate a bar chart showing the count of male and female penguins.

# Bar chart of the number of female and male penguins (complete cases only),
# with each count printed above its bar.
# `after_stat(count)` replaces the deprecated `..count..` notation.
ggplot(penguins_clean, aes(x = sex)) +
  geom_bar(stat = "count", fill = "skyblue") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "Sex distribution of Penguins")

Body Mass Density: Plot the density distribution of body mass.

# Kernel density estimate of body mass over all complete cases.
ggplot(data = penguins_clean, mapping = aes(x = body_mass_g)) +
  geom_density(fill = "cyan")

Scatter Plot: Create a scatter plot of bill length vs. bill depth.

# Bill length (x) against bill depth (y), one point per penguin.
ggplot(data = penguins_clean, mapping = aes(x = bill_length_mm, y = bill_depth_mm)) +
  geom_point()

## Color by Species: the same scatter plot, with species shown through both
## point colour and point shape.
ggplot(
  data = penguins_clean,
  mapping = aes(
    x = bill_length_mm,
    y = bill_depth_mm,
    colour = species,
    shape = species
  )
) +
  geom_point()

Yearly Observations: Plot the number of observations per year.

# Bar chart of the number of observations recorded in each study year, with
# the count printed above every bar.
# `after_stat(count)` replaces the deprecated `..count..` notation.
ggplot(penguins, aes(x = year)) +
  geom_bar(stat = "count", fill = "blue") +
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5) +
  labs(title = "No. of observations per year.")

Sex Ratio per Species: Visualize the sex ratio within each species.

# Male-to-female ratio per species, computed by hand on the complete cases.
# Each `count()` result is divided as-is; `as.numeric()` below unwraps the
# quotients into plain numbers for the plotting data frame.

# Sex ratio for Adelie.
male_Adelie <- penguins_clean %>%
  filter(sex %in% "male", species %in% "Adelie") %>%
  count()
female_Adelie <- penguins_clean %>%
  filter(sex %in% "female", species %in% "Adelie") %>%
  count()
sex_ratio_Adelie <- male_Adelie / female_Adelie

# Sex ratio for Chinstrap.
male_Chinstrap <- penguins_clean %>%
  filter(sex %in% "male", species %in% "Chinstrap") %>%
  count()
female_Chinstrap <- penguins_clean %>%
  filter(sex %in% "female", species %in% "Chinstrap") %>%
  count()
sex_ratio_Chinstrap <- male_Chinstrap / female_Chinstrap

# Sex ratio for Gentoo.
male_Gentoo <- penguins_clean %>%
  filter(sex %in% "male", species %in% "Gentoo") %>%
  count()
female_Gentoo <- penguins_clean %>%
  filter(sex %in% "female", species %in% "Gentoo") %>%
  count()
sex_ratio_Gentoo <- male_Gentoo / female_Gentoo

sex_ratio <- data.frame(
  species = c("Adelie", "Chinstrap", "Gentoo"),
  sex_ratio = c(
    as.numeric(sex_ratio_Adelie),
    as.numeric(sex_ratio_Chinstrap),
    as.numeric(sex_ratio_Gentoo)
  )
)

# Labels are rounded to two decimals; the stored ratios keep full precision.
ggplot(sex_ratio, aes(x = species, y = sex_ratio)) +
  geom_col() +
  geom_text(aes(label = round(sex_ratio, 2)), vjust = -0.5) +
  labs(title = "Sex Ratio of each Species")

Simplified by ChatGPT, then modified by me :)

library(tidyr)
# Same male-to-female ratio, derived in one pipeline: count each species/sex
# pair, spread the two sexes into columns, then divide.
sex_ratio1 <- penguins_clean %>%
  count(species, sex) %>%
  pivot_wider(names_from = sex, values_from = n) %>%
  mutate(sex_ratio1 = male / female) %>%
  select(species, sex_ratio1)

print(sex_ratio1)
## # A tibble: 3 × 2
##   species   sex_ratio1
##   <fct>          <dbl>
## 1 Adelie          1   
## 2 Chinstrap       1   
## 3 Gentoo          1.05
# Bar chart of the male-to-female ratio per species.
# Labels are rounded to two decimals so a non-terminating ratio does not
# print with its full floating-point expansion.
ggplot(sex_ratio1, aes(x = species, y = sex_ratio1)) +
  geom_col(fill = "blue") +
  geom_text(aes(label = round(sex_ratio1, 2)), vjust = -0.5) +
  labs(title = "Sex Ratio of each Species")

Bill Length vs. Flipper Length: Plot and analyze the relationship between bill length and flipper length.

# Scatter of bill length against flipper length with a straight-line fit
# (no confidence band) drawn over all points.
ggplot(data = penguins_clean, mapping = aes(x = bill_length_mm, y = flipper_length_mm)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

Body Mass by Sex: Create boxplots of body mass grouped by sex.

# One box per sex; outliers are drawn as hollow red circles.
ggplot(data = penguins_clean, mapping = aes(x = sex, y = body_mass_g)) +
  geom_boxplot(
    colour = "#3366FF",
    outlier.colour = "red",
    outlier.shape = 1
  )

Island-Species Heatmap: Generate a heatmap showing the count of each species on each island.

# Number of penguins for every species/island pair that occurs in the data.
heatmap_data <- count(penguins_clean, species, island)

# One tile per pair, shaded from light to dark blue as the count grows.
ggplot(heatmap_data, aes(x = island, y = species, fill = n)) +
  geom_tile(color = "red") +
  scale_fill_gradient(low = "lightblue", high = "steelblue") +
  labs(
    title = "Island-Species Heatmap",
    x = "Island",
    y = "Species",
    fill = "Count"
  )

Correlation Matrix: Compute and visualize the correlation matrix of numerical variables.

library(ggcorrplot)
# Correlation matrix of every numeric column, rounded to one decimal place.
# Note that `year` is numeric and is therefore part of the matrix.
corr_mat <- penguins_clean %>%
  select(where(is.numeric)) %>%
  cor() %>%
  round(1)
ggcorrplot(corr_mat, method = "square")

Data Filtering: Filter the dataset for penguins with flipper length > 200 mm.

# Penguins whose flipper is strictly longer than 200 mm.
# Base R equivalent: penguins_clean[penguins_clean$flipper_length_mm > 200, ]
# dplyr version:
filter(penguins_clean, flipper_length_mm > 200)
## # A tibble: 144 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Dream               35.7          18                 202        3550
##  2 Adelie  Dream               41.1          18.1               205        4300
##  3 Adelie  Dream               40.8          18.9               208        4300
##  4 Adelie  Biscoe              41            20                 203        4725
##  5 Adelie  Torgersen           41.4          18.5               202        3875
##  6 Adelie  Torgersen           44.1          18                 210        4000
##  7 Adelie  Dream               41.5          18.5               201        4000
##  8 Gentoo  Biscoe              46.1          13.2               211        4500
##  9 Gentoo  Biscoe              50            16.3               230        5700
## 10 Gentoo  Biscoe              48.7          14.1               210        4450
## # ℹ 134 more rows
## # ℹ 2 more variables: sex <fct>, year <int>

Group-wise Summaries: Calculate the mean and standard deviation of bill length for each species and sex combination.

# Mean and standard deviation of bill length for every species/sex pair.
# Grouping by species alone would pool males and females together, which is
# not what the task asks for. `summarise()` with named expressions replaces
# the superseded `summarise_at()`; `.groups = "drop"` returns an ungrouped
# tibble.
penguins_clean %>%
  group_by(species, sex) %>%
  summarise(
    Mean = mean(bill_length_mm),
    Std = sd(bill_length_mm),
    .groups = "drop"
  )
# Result: one row per species/sex pair with columns species, sex, Mean, Std.
# Re-knit the document to regenerate the printed table.

Top Observations: Identify the top 5 penguins with the highest body mass within each species.

# Heaviest five penguins per species, heaviest first. Ties at the cut-off are
# kept, so a species can contribute more than five rows.
penguins_clean %>%
  group_by(species) %>%
  slice_max(body_mass_g, n = 5) %>%
  arrange(species, desc(body_mass_g)) %>%
  select(species, body_mass_g)
## # A tibble: 16 × 2
## # Groups:   species [3]
##    species   body_mass_g
##    <fct>           <int>
##  1 Adelie           4775
##  2 Adelie           4725
##  3 Adelie           4700
##  4 Adelie           4675
##  5 Adelie           4650
##  6 Chinstrap        4800
##  7 Chinstrap        4550
##  8 Chinstrap        4500
##  9 Chinstrap        4450
## 10 Chinstrap        4400
## 11 Gentoo           6300
## 12 Gentoo           6050
## 13 Gentoo           6000
## 14 Gentoo           6000
## 15 Gentoo           5950
## 16 Gentoo           5950

Island Comparison: Compare the average body mass of penguins across different islands.

# Average body mass (g) of the penguins observed on each island.
# A plain `summarise()` replaces the superseded `summarise_at()`; the result
# has the same columns and values.
penguins_clean %>%
  group_by(island) %>%
  summarise(Average_body_mass = mean(body_mass_g))
## # A tibble: 3 × 2
##   island    Average_body_mass
##   <fct>                 <dbl>
## 1 Biscoe                4719.
## 2 Dream                 3719.
## 3 Torgersen             3709.

Missing Data Analysis: Determine the percentage of missing values for each variable.

 # Share of NA entries per column of the raw data, expressed in percent.
 colSums(is.na(penguins)) / nrow(penguins) * 100
##           species            island    bill_length_mm     bill_depth_mm 
##         0.0000000         0.0000000         0.5813953         0.5813953 
## flipper_length_mm       body_mass_g               sex              year 
##         0.5813953         0.5813953         3.1976744         0.0000000

Data Filtering: Filter penguins with bill depth greater than the species average.

# Penguins whose bill is deeper than the mean bill depth of their own species.
# The species mean is attached as `Avg_mean` so it can be shown next to each
# retained value.
penguins_clean %>%
  group_by(species) %>%
  mutate(Avg_mean = mean(bill_depth_mm)) %>%
  filter(bill_depth_mm > Avg_mean) %>%
  select(species, bill_depth_mm, Avg_mean)
## # A tibble: 171 × 3
## # Groups:   species [3]
##    species bill_depth_mm Avg_mean
##    <fct>           <dbl>    <dbl>
##  1 Adelie           18.7     18.3
##  2 Adelie           19.3     18.3
##  3 Adelie           20.6     18.3
##  4 Adelie           19.6     18.3
##  5 Adelie           21.2     18.3
##  6 Adelie           21.1     18.3
##  7 Adelie           19       18.3
##  8 Adelie           20.7     18.3
##  9 Adelie           18.4     18.3
## 10 Adelie           21.5     18.3
## # ℹ 161 more rows

Data Transformation: Create a new variable indicating whether a penguin’s body mass is above or below the species average.

# Label each penguin relative to the mean body mass of its species:
# "Above" when strictly heavier than the mean, "Below" otherwise.
penguins_clean %>%
  group_by(species) %>%
  mutate(
    Avg_body_mass = mean(body_mass_g),
    body_mass_level = if_else(
      condition = body_mass_g > Avg_body_mass,
      true = "Above",
      false = "Below"
    )
  ) %>%
  select(species, body_mass_g, Avg_body_mass, body_mass_level)
## # A tibble: 333 × 4
## # Groups:   species [3]
##    species body_mass_g Avg_body_mass body_mass_level
##    <fct>         <int>         <dbl> <chr>          
##  1 Adelie         3750         3706. Above          
##  2 Adelie         3800         3706. Above          
##  3 Adelie         3250         3706. Below          
##  4 Adelie         3450         3706. Below          
##  5 Adelie         3650         3706. Below          
##  6 Adelie         3625         3706. Below          
##  7 Adelie         4675         3706. Above          
##  8 Adelie         3200         3706. Below          
##  9 Adelie         3800         3706. Above          
## 10 Adelie         4400         3706. Above          
## # ℹ 323 more rows

Ranking: Rank penguins within each species based on flipper length.

# Rank penguins inside each species by flipper length, longest first.
# `dense_rank()` gives tied penguins the same rank and leaves no gaps.
penguins_clean %>%
  group_by(species) %>%
  mutate(Rank = dense_rank(desc(flipper_length_mm))) %>%
  select(species, flipper_length_mm, Rank) %>%
  arrange(species, Rank)
## # A tibble: 333 × 3
## # Groups:   species [3]
##    species flipper_length_mm  Rank
##    <fct>               <int> <int>
##  1 Adelie                210     1
##  2 Adelie                208     2
##  3 Adelie                205     3
##  4 Adelie                203     4
##  5 Adelie                202     5
##  6 Adelie                202     5
##  7 Adelie                201     6
##  8 Adelie                200     7
##  9 Adelie                200     7
## 10 Adelie                199     8
## # ℹ 323 more rows

Proportion Calculation: Calculate the proportion of each species on every island.

# Share (in percent, two decimals) of each species among the penguins observed
# on each island. Counts are divided by the island total, so the shares within
# one island add up to 100; dividing by the overall row count would ignore
# the island entirely.
penguins_clean %>%
  count(island, species) %>%
  group_by(island) %>%
  mutate(Proportion = round(n / sum(n) * 100, 2)) %>%
  ungroup() %>%
  select(island, species, Proportion)
# Result: one row per island/species pair present in the data, with columns
# island, species, Proportion. Re-knit the document to regenerate the table.

Data Reshaping: Reshape the dataset to have separate columns for male and female body mass averages per species.

library(tidyr)
# Average body mass per species, with one column per sex
# (Avg_body_mass_female / Avg_body_mass_male).
penguins_clean %>%
  group_by(species, sex) %>%
  summarise(Avg_body_mass = mean(body_mass_g), .groups = "drop") %>%
  pivot_wider(
    names_from = sex,
    values_from = Avg_body_mass,
    names_prefix = "Avg_body_mass_"
  )
## # A tibble: 3 × 3
##   species   Avg_body_mass_female Avg_body_mass_male
##   <fct>                    <dbl>              <dbl>
## 1 Adelie                   3369.              4043.
## 2 Chinstrap                3527.              3939.
## 3 Gentoo                   4680.              5485.
#.groups = "drop" tells dplyr to ungroup the result afterward, so it doesn't stay grouped by species or sex.

Advanced Visualizations with ggplot2

Faceted Histograms: Create histograms of bill length faceted by species and filled by sex.

# Bill-length histograms (20 bins), one panel per species stacked vertically,
# with bars filled by sex.
ggplot(data = penguins_clean, mapping = aes(x = bill_length_mm, fill = sex)) +
  geom_histogram(bins = 20) +
  facet_wrap(vars(species), nrow = 3)

Boxplots with Jitter: Overlay jittered data points on boxplots of flipper length by species.

# Flipper-length boxplot per species with the individual observations drawn
# on top as semi-transparent, horizontally jittered points.
ggplot(data = penguins_clean, mapping = aes(x = species, y = flipper_length_mm)) +
  geom_boxplot(
    colour = "#3366FF",
    outlier.colour = "red",
    outlier.shape = 1
  ) +
  geom_jitter(width = 0.2, alpha = 0.5, color = "darkblue")

# geom_jitter() shows the individual observations alongside the summary (boxplot).

Density Plots: Plot density curves of body mass for each species, using transparency to handle overlap.

# Overlapping body-mass density curves, one per species; alpha = 0.5 keeps
# the curves behind visible.
ggplot(data = penguins_clean, mapping = aes(x = body_mass_g, fill = species)) +
  geom_density(alpha = 0.5)

Facet Grids: Use facet grids to compare bill depth across species and islands.

# Bill-depth density in a grid of panels: species in rows, islands in columns.
ggplot(data = penguins_clean, mapping = aes(x = bill_depth_mm, fill = species)) +
  geom_density() +
  facet_grid(rows = vars(species), cols = vars(island))

Grouped Bar Charts: Create grouped bar charts to compare the number of male and female penguins across species.

# Grouped (side-by-side) bars comparing the number of female and male penguins
# within each species. `geom_bar()` stacks by default, so both the bars and
# their labels are dodged with the same width to stay aligned.
# `after_stat(count)` replaces the deprecated `..count..` notation.
ggplot(penguins_clean, aes(x = species, fill = sex)) +
  geom_bar(position = position_dodge(width = 0.9)) +
  scale_fill_manual(values = c(male = "blue", female = "skyblue")) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    position = position_dodge(width = 0.9),
    vjust = -0.5
  )

Data Labels: Add data labels to a bar chart showing the count of penguins per island.

# Bar chart of the number of penguins per island with the count printed above
# each bar. The x axis is mapped to `island` so every bar is one island; the
# legend is dropped because the axis already names the islands.
# `after_stat(count)` replaces the deprecated `..count..` notation.
ggplot(penguins_clean, aes(x = island, fill = island)) +
  geom_bar(show.legend = FALSE) +
  scale_fill_manual(
    values = c(
      Biscoe = "blue",
      Dream = "skyblue",
      Torgersen = "navyblue"
    )
  ) +
  geom_text(
    stat = "count",
    aes(label = after_stat(count)),
    vjust = -0.5
  ) +
  theme_minimal()

Identify At-Risk Subgroups: Suppose low body mass indicates health risk. Identify at-risk species-year-island combinations and visualize them.

# Mean body mass (g) below which a species/island/year subgroup is flagged as
# at risk. Defined once so the filter and the plot subtitle cannot drift apart.
risk_threshold_g <- 3650

# Average body mass for every species/island/year combination.
subgroup <- penguins_clean %>%
  group_by(species, island, year) %>%
  summarise(Avg_body_mass = mean(body_mass_g), .groups = "drop")

# Subgroups whose average falls below the threshold.
under_risk <- filter(subgroup, Avg_body_mass < risk_threshold_g)

# One line per species/island pair over the years, one panel per island;
# at-risk subgroups are marked with black points. The point layer inherits the
# x/y mapping from `ggplot()` and only swaps in the `under_risk` data.
ggplot(subgroup, aes(x = year, y = Avg_body_mass)) +
  geom_line(
    aes(group = interaction(species, island), colour = species),
    linewidth = 1.2
  ) +
  geom_point(data = under_risk, color = "black", size = 3) +
  # Year is stored as a number; restrict the axis breaks to the observed years.
  scale_x_continuous(breaks = sort(unique(subgroup$year))) +
  facet_wrap(~island) +
  labs(
    title = "Average Body Mass of Palmer Penguins Over Years by Species and Island",
    subtitle = paste0(
      "Black points indicate under-risk subgroups (mean < ",
      risk_threshold_g,
      "g)"
    ),
    x = "Year",
    y = "Average Body Mass (g)"
  )

#interaction(species, island): Ensures separate lines per species–island combo.