library(palmerpenguins)
##
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
##
## penguins, penguins_raw
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(skimr)
library(janitor)
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
data("penguins")
head(penguins)
str(penguins)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
## $ species : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ island : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ bill_length_mm : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
## $ bill_depth_mm : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
## $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
## $ body_mass_g : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
## $ sex : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
## $ year : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
skim_without_charts(penguins)
| Name | penguins |
| Number of rows | 344 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| factor | 3 |
| numeric | 5 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| species | 0 | 1.00 | FALSE | 3 | Ade: 152, Gen: 124, Chi: 68 |
| island | 0 | 1.00 | FALSE | 3 | Bis: 168, Dre: 124, Tor: 52 |
| sex | 11 | 0.97 | FALSE | 2 | mal: 168, fem: 165 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| bill_length_mm | 2 | 0.99 | 43.92 | 5.46 | 32.1 | 39.23 | 44.45 | 48.5 | 59.6 |
| bill_depth_mm | 2 | 0.99 | 17.15 | 1.97 | 13.1 | 15.60 | 17.30 | 18.7 | 21.5 |
| flipper_length_mm | 2 | 0.99 | 200.92 | 14.06 | 172.0 | 190.00 | 197.00 | 213.0 | 231.0 |
| body_mass_g | 2 | 0.99 | 4201.75 | 801.95 | 2700.0 | 3550.00 | 4050.00 | 4750.0 | 6300.0 |
| year | 0 | 1.00 | 2008.03 | 0.82 | 2007.0 | 2007.00 | 2008.00 | 2009.0 | 2009.0 |
penguins_clean <- penguins %>%
clean_names() %>%
drop_na()
summary(penguins_clean)
## species island bill_length_mm bill_depth_mm
## Adelie :146 Biscoe :163 Min. :32.10 Min. :13.10
## Chinstrap: 68 Dream :123 1st Qu.:39.50 1st Qu.:15.60
## Gentoo :119 Torgersen: 47 Median :44.50 Median :17.30
## Mean :43.99 Mean :17.16
## 3rd Qu.:48.60 3rd Qu.:18.70
## Max. :59.60 Max. :21.50
## flipper_length_mm body_mass_g sex year
## Min. :172 Min. :2700 female:165 Min. :2007
## 1st Qu.:190 1st Qu.:3550 male :168 1st Qu.:2007
## Median :197 Median :4050 Median :2008
## Mean :201 Mean :4207 Mean :2008
## 3rd Qu.:213 3rd Qu.:4775 3rd Qu.:2009
## Max. :231 Max. :6300 Max. :2009
penguins_clean_summary <- penguins_clean %>%
group_by(species) %>%
summarize(
count = n(),
mean_flipper = mean(flipper_length_mm),
mean_mass = mean(body_mass_g),
sd_flipper = sd(flipper_length_mm),
sd_mass = sd(body_mass_g)
)
penguins_clean_summary
ggplot(penguins_clean, aes(x = species, y = flipper_length_mm, fill = species)) +
geom_boxplot() +
labs(title = "Flipper Length by Species", y = "Flipper Length (mm)") +
theme_minimal()
ggplot(penguins_clean, aes(x = species, y = body_mass_g, fill = sex)) +
geom_boxplot() +
labs(title = "Body Mass by Species and Sex", y = "Body Mass (g)") +
theme_minimal()
ggplot(penguins_clean, aes(x = flipper_length_mm, y = body_mass_g, color = species, shape = sex)) +
geom_point(size = 3) +
labs(title = "Flipper Length vs Body Mass", x = "Flipper Length (mm)", y = "Body Mass (g)") +
theme_light()
ggplot(penguins_clean, aes(x = flipper_length_mm, y = body_mass_g, color = species, shape = sex)) +
geom_point(size = 3) +
facet_wrap(~species) +
labs(title = "Flipper Length vs Body Mass by Species", x = "Flipper Length (mm)", y = "Body Mass (g)") +
theme_light()
ggplot(penguins_clean, aes(x = flipper_length_mm, y = body_mass_g, color = species, shape = sex)) +
geom_point(size = 3) +
facet_wrap(~sex) +
labs(title = "Flipper Length vs Body Mass by Sex", x = "Flipper Length (mm)", y = "Body Mass (g)") +
theme_light()
We hypothesize that flipper length is positively correlated with body mass in penguins and that species and sex significantly influence this relationship.
penguins_model <- penguins_clean %>%
mutate(
species_adelie = as.integer(species == "Adelie"),
species_chinstrap = as.integer(species == "Chinstrap"),
species_gentoo = as.integer(species == "Gentoo"),
sex_female = as.integer(sex == "female"),
sex_male = as.integer(sex == "male")
)
model <- lm(body_mass_g ~ flipper_length_mm + species_gentoo + species_chinstrap + sex_male, data = penguins_model)
summary(model)
##
## Call:
## lm(formula = body_mass_g ~ flipper_length_mm + species_gentoo +
## species_chinstrap + sex_male, data = penguins_model)
##
## Residuals:
## Min 1Q Median 3Q Max
## -721.8 -195.7 -5.9 198.6 873.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -365.817 532.050 -0.688 0.4922
## flipper_length_mm 20.025 2.846 7.037 1.15e-11 ***
## species_gentoo 836.260 85.185 9.817 < 2e-16 ***
## species_chinstrap -87.634 46.347 -1.891 0.0595 .
## sex_male 530.381 37.810 14.027 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 295.6 on 328 degrees of freedom
## Multiple R-squared: 0.8669, Adjusted R-squared: 0.8653
## F-statistic: 534 on 4 and 328 DF, p-value: < 2.2e-16
plot(model, which = 1)
plot(model, which = 3)
plot(model, which = 2)
residuals_df <- data.frame(resid = residuals(model))
ggplot(residuals_df, aes(x = resid)) +
geom_density(fill = "skyblue", alpha = 0.5, color = "black") +
stat_function(fun = dnorm,
args = list(mean = mean(residuals_df$resid),
sd = sd(residuals_df$resid)),
color = "red", linetype = "dashed", size = 1) +
labs(title = "Density Plot of Model Residuals",
x = "Residuals",
y = "Density") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
penguins_model <- penguins_model %>%
mutate(predicted_mass = predict(model))
ggplot(penguins_model, aes(x = predicted_mass, y = body_mass_g, color = species, shape = sex)) +
geom_point(alpha = 0.7) +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
labs(title = "Actual vs. Predicted Body Mass by Species and Sex",
x = "Predicted Mass (g)",
y = "Actual Mass (g)") +
theme_light()
This analysis explored the relationship between flipper length and body mass in penguins using the Palmer Penguins dataset. Through data cleaning, visualization, and linear modeling, we found a clear positive correlation between flipper length and body mass. Our hypothesis that species and sex significantly influence body mass was supported by the data, as the linear model revealed statistically significant effects for both predictors.
Specifically, the Gentoo species was notably heavier than the Adelie baseline, while male penguins consistently weighed more than females. The model explained approximately 86% of the variance in body mass, suggesting a strong fit.
Visual diagnostics confirmed that the model largely conforms to the key assumptions of Ordinary Least Squares regression, although a mild indication of non-linearity may be present. Future modeling could explore potential interaction effects or nonlinear transformations (e.g., polynomial terms or splines) to further refine predictive performance.
Overall, this study demonstrates that flipper length, species, and sex are meaningful predictors of penguin body mass and showcases the value of tidy data workflows, exploratory visualization, and regression modeling in ecological data analysis.