Step 1: Load Required Packages

library(palmerpenguins)
## 
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
## 
##     penguins, penguins_raw
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test

Step 2: Load and Inspect the Dataset

data("penguins")
head(penguins)
str(penguins)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
##  $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ bill_length_mm   : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ bill_depth_mm    : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
##  $ year             : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
skim_without_charts(penguins)
Data summary
Name penguins
Number of rows 344
Number of columns 8
_______________________
Column type frequency:
factor 3
numeric 5
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
species 0 1.00 FALSE 3 Ade: 152, Gen: 124, Chi: 68
island 0 1.00 FALSE 3 Bis: 168, Dre: 124, Tor: 52
sex 11 0.97 FALSE 2 mal: 168, fem: 165

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
bill_length_mm 2 0.99 43.92 5.46 32.1 39.23 44.45 48.5 59.6
bill_depth_mm 2 0.99 17.15 1.97 13.1 15.60 17.30 18.7 21.5
flipper_length_mm 2 0.99 200.92 14.06 172.0 190.00 197.00 213.0 231.0
body_mass_g 2 0.99 4201.75 801.95 2700.0 3550.00 4050.00 4750.0 6300.0
year 0 1.00 2008.03 0.82 2007.0 2007.00 2008.00 2009.0 2009.0

Step 3: Clean the Data

penguins_clean <- penguins %>%
  clean_names() %>%
  drop_na()

summary(penguins_clean)
##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :146   Biscoe   :163   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :123   1st Qu.:39.50   1st Qu.:15.60  
##  Gentoo   :119   Torgersen: 47   Median :44.50   Median :17.30  
##                                  Mean   :43.99   Mean   :17.16  
##                                  3rd Qu.:48.60   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172       Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190       1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197       Median :4050                Median :2008  
##  Mean   :201       Mean   :4207                Mean   :2008  
##  3rd Qu.:213       3rd Qu.:4775                3rd Qu.:2009  
##  Max.   :231       Max.   :6300                Max.   :2009

Step 4: Summary Statistics by Species

penguins_clean_summary <- penguins_clean %>%
  group_by(species) %>%
  summarize(
    count = n(),
    mean_flipper = mean(flipper_length_mm),
    mean_mass = mean(body_mass_g),
    sd_flipper = sd(flipper_length_mm),
    sd_mass = sd(body_mass_g)
  )
penguins_clean_summary

Step 5: Visualizations

Flipper Length by Species

ggplot(penguins_clean, aes(x = species, y = flipper_length_mm, fill = species)) +
  geom_boxplot() +
  labs(title = "Flipper Length by Species", y = "Flipper Length (mm)") +
  theme_minimal()

Body Mass by Species and Sex

ggplot(penguins_clean, aes(x = species, y = body_mass_g, fill = sex)) +
  geom_boxplot() +
  labs(title = "Body Mass by Species and Sex", y = "Body Mass (g)") +
  theme_minimal()

Flipper Length vs Body Mass (Overall)

ggplot(penguins_clean, aes(x = flipper_length_mm, y = body_mass_g, color = species, shape = sex)) +
  geom_point(size = 3) +
  labs(title = "Flipper Length vs Body Mass", x = "Flipper Length (mm)", y = "Body Mass (g)") +
  theme_light()

Faceted by Species

ggplot(penguins_clean, aes(x = flipper_length_mm, y = body_mass_g, color = species, shape = sex)) +
  geom_point(size = 3) +
  facet_wrap(~species) +
  labs(title = "Flipper Length vs Body Mass by Species", x = "Flipper Length (mm)", y = "Body Mass (g)") +
  theme_light()

Faceted by Sex

ggplot(penguins_clean, aes(x = flipper_length_mm, y = body_mass_g, color = species, shape = sex)) +
  geom_point(size = 3) +
  facet_wrap(~sex) +
  labs(title = "Flipper Length vs Body Mass by Sex", x = "Flipper Length (mm)", y = "Body Mass (g)") +
  theme_light()

Step 6: Hypothesis

We hypothesize that flipper length is positively correlated with body mass in penguins and that species and sex significantly influence this relationship.

Step 7: Create Indicator Variables & Fit Model

penguins_model <- penguins_clean %>%
  mutate(
    species_adelie = as.integer(species == "Adelie"),
    species_chinstrap = as.integer(species == "Chinstrap"),
    species_gentoo = as.integer(species == "Gentoo"),
    sex_female = as.integer(sex == "female"),
    sex_male = as.integer(sex == "male")
  )

model <- lm(body_mass_g ~ flipper_length_mm + species_gentoo + species_chinstrap + sex_male, data = penguins_model)
summary(model)
## 
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm + species_gentoo + 
##     species_chinstrap + sex_male, data = penguins_model)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -721.8 -195.7   -5.9  198.6  873.7 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -365.817    532.050  -0.688   0.4922    
## flipper_length_mm   20.025      2.846   7.037 1.15e-11 ***
## species_gentoo     836.260     85.185   9.817  < 2e-16 ***
## species_chinstrap  -87.634     46.347  -1.891   0.0595 .  
## sex_male           530.381     37.810  14.027  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 295.6 on 328 degrees of freedom
## Multiple R-squared:  0.8669, Adjusted R-squared:  0.8653 
## F-statistic:   534 on 4 and 328 DF,  p-value: < 2.2e-16

Step 8: Model Diagnostics

Residuals vs Fitted

plot(model, which = 1)

Scale-Location Plot

plot(model, which = 3)

Normal Q-Q Plot

plot(model, which = 2)

Residual Density Plot

residuals_df <- data.frame(resid = residuals(model))

ggplot(residuals_df, aes(x = resid)) +
  geom_density(fill = "skyblue", alpha = 0.5, color = "black") +
  stat_function(fun = dnorm,
                args = list(mean = mean(residuals_df$resid), 
                            sd = sd(residuals_df$resid)),
                color = "red", linetype = "dashed", size = 1) +
  labs(title = "Density Plot of Model Residuals",
       x = "Residuals",
       y = "Density") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Step 9: Predicted vs Actual

penguins_model <- penguins_model %>%
  mutate(predicted_mass = predict(model))

ggplot(penguins_model, aes(x = predicted_mass, y = body_mass_g, color = species, shape = sex)) +
  geom_point(alpha = 0.7) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(title = "Actual vs. Predicted Body Mass by Species and Sex",
       x = "Predicted Mass (g)",
       y = "Actual Mass (g)") +
  theme_light()

Conclusion

This analysis explored the relationship between flipper length and body mass in penguins using the Palmer Penguins dataset. Through data cleaning, visualization, and linear modeling, we found a clear positive correlation between flipper length and body mass. Our hypothesis that species and sex significantly influence body mass was supported by the data, as the linear model revealed statistically significant effects for both predictors.

Specifically, the Gentoo species was notably heavier than the Adelie baseline, while male penguins consistently weighed more than females. The model explained approximately 86% of the variance in body mass, suggesting a strong fit.

Visual diagnostics confirmed that the model largely conforms to the key assumptions of Ordinary Least Squares regression, although a mild indication of non-linearity may be present. Future modeling could explore potential interaction effects or nonlinear transformations (e.g., polynomial terms or splines) to further refine predictive performance.

Overall, this study demonstrates that flipper length, species, and sex are meaningful predictors of penguin body mass and showcases the value of tidy data workflows, exploratory visualization, and regression modeling in ecological data analysis.