“The Association Between Bill Size and Body Mass of Penguins: With differing effects depending on the Species, Diet, and Island”
This is my research to explore how bill size of penguins influences body mass in relation to their species, diet, and island. I hypothesize that bill size is positively associated with the body mass of penguins, with varying effects depending on species, diet, and island. The results of the statistical analysis support my hypothesis that bill size is positively associated with body mass, with varying effects depending on species, diet, and island. For example, according to the linear regression model, there is approximately 70.27% of the variability in body mass (Adjusted R-squared = 0.7027). The overall model was highly significant, F-statistic is 208.8, with 39 and 3390 degrees of freedom, and the result was highly significant at p-value < 2.2e-16. This indicates that the combination of predictor variables effectively accounted for a substantial proportion of the variance in body mass.The ANOVA results also confirmed the significance of those variables, including bill size (F = 1752.61, p < 2.2e-16), species (F = 43.50, p < 2.2e-16), and diet (F = 542.27, p < 2.2e-16), as well as interactions such as bill size:species (F = 5.30, p = 0.005) and bill size:diet (F = 9.50, p < 0.001).
```{r}
penguins <- read.csv(“palmerpenguins.csv”, header = TRUE)
head(penguins) # first 6 rows tail(penguins) # last 6 rows colnames(penguins) # view column names rownames(penguins) # view row names
str(penguins)# structure, giving a concise summary of this information, showing data types and first few values of each variable
penguins <- read.csv(“palmerpenguins.csv”, header = TRUE)
str(penguins) # This will show the structure of the dataset, including variable names and data types
num_variables <- ncol(penguins) # Count the number of columns (variables) cat(“The dataset has”, num_variables, “variables.”)
categorical.vars <- names(penguins)[sapply(penguins, is.character) | sapply(penguins, is.factor)] cat(“Categorical variables:”, categorical.vars, “”)
cat(“within each categorical variable:”) cat(“—————————”) # Print the variable name for (var in categorical.vars) {cat(“Variable:”, var, “”) print(unique(penguins[[var]])) # Show unique levels cat(“—————————”) }
summary(penguins)
sapply(penguins, function(x) if (is.factor(x)) levels(x)) # lists levels for categorical variables
table(penguins\(species) # counts of species table(penguins\)island) # counts of islands table(penguins\(sex) table(penguins\)diet) table(penguins\(life_stage) table(penguins\)health_metrics) table(penguins$year)
library(dplyr)
numerical_data <- penguins %>% select(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g)
summary_stats <- numerical_data %>% summarise( mean_bill_length = mean(bill_length_mm, na.rm = TRUE), sd_bill_length = sd(bill_length_mm, na.rm = TRUE), mean_bill_depth = mean(bill_depth_mm, na.rm = TRUE), sd_bill_depth = sd(bill_depth_mm, na.rm = TRUE), mean_flipper_length = mean(flipper_length_mm, na.rm = TRUE), sd_flipper_length = sd(flipper_length_mm, na.rm = TRUE), mean_body_mass = mean(body_mass_g, na.rm = TRUE), sd_body_mass = sd(body_mass_g, na.rm = TRUE) )
print(summary_stats)
m_body_mass <- mean(penguins\(body_mass_g, na.rm=TRUE) # Calculate mean std_body_mass <- sd(penguins\)body_mass_g, na.rm=TRUE) # Calculate standard deviation
hist(penguins$body_mass_g, density=20, breaks=20, prob=TRUE, xlab=“Body Mass (g)”, ylim=c(0, 0.0005), main=“Normal Curve over Body Mass Histogram”) curve(dnorm(x, mean=m_body_mass, sd=std_body_mass), col=“red”, lwd=2, add=TRUE, yaxt=“n”)
citation() citation(“dplyr”) citation(“pkgname”)
m_bill_length <- mean(penguins\(bill_length_mm, na.rm=TRUE) std_bill_length <- sd(penguins\)bill_length_mm, na.rm=TRUE)
hist(penguins$bill_length_mm, density=20, breaks=20, prob=TRUE, xlab=“Bill Length (mm)”, ylim=c(0, 0.06), main=“Normal Curve over Bill Length Histogram”) curve(dnorm(x, mean=m_bill_length, sd=std_bill_length), col=“red”, lwd=2, add=TRUE, yaxt=“n”)
m_bill_depth <- mean(penguins\(bill_depth_mm, na.rm=TRUE) std_bill_depth <- sd(penguins\)bill_depth_mm, na.rm=TRUE)
hist(penguins$bill_depth_mm, density=20, breaks=20, prob=TRUE, xlab=“Bill Depth (mm)”, ylim=c(0, 0.12), main=“Normal Curve over Bill Depth Histogram”) curve(dnorm(x, mean=m_bill_depth, sd=std_bill_depth), col=“red”, lwd=2, add=TRUE, yaxt=“n”)
m_flipper_length <- mean(penguins\(flipper_length_mm, na.rm=TRUE) std_flipper_length <- sd(penguins\)flipper_length_mm, na.rm=TRUE)
hist(penguins$flipper_length_mm, density=20, breaks=20, prob=TRUE, xlab=“Flipper Length (mm)”, ylim=c(0, 0.04), main=“Normal Curve over Flipper Length Histogram”) curve(dnorm(x, mean=m_flipper_length, sd=std_flipper_length), col=“red”, lwd=2, add=TRUE, yaxt=“n”)
qqnorm(penguins\(body_mass_g, ylab="Standardized Residuals", xlab="Normal Scores", main="QQ Plot for Body Mass", pch=16) qqline(penguins\)body_mass_g, col=“red”)
qqnorm(penguins\(bill_length_mm, ylab="Standardized Residuals", xlab="Normal Scores", main="QQ Plot for Bill Length", pch=16) qqline(penguins\)bill_length_mm, col=“red”)
qqnorm(penguins\(bill_depth_mm, ylab="Standardized Residuals", xlab="Normal Scores", main="QQ Plot for Bill Depth", pch=16) qqline(penguins\)bill_depth_mm, col=“red”)
qqnorm(penguins\(flipper_length_mm, ylab="Standardized Residuals", xlab="Normal Scores", main="QQ Plot for Flipper Length", pch=16) qqline(penguins\)flipper_length_mm, col=“red”)
library(ggplot2) citation(“ggplot2”)
ggplot(penguins, aes(x = bill_length_mm, y = body_mass_g)) + geom_point(size = 1, alpha = 0.7, color = “blue”) + geom_smooth(method = “lm”, se = TRUE, color = “red”) + # Adds linear regression line labs(title = “Bill Length vs Body Mass”, x = “Bill Length (mm)”, y = “Body Mass (g)”) + theme_minimal()
#Scatterplot for bill depth vs body mass ggplot(penguins, aes(x = bill_depth_mm, y = body_mass_g)) + geom_point(size = 1, alpha = 0.7, color = “pink”) + geom_smooth(method = “lm”, se = TRUE, color = “orange”) + # Adds linear regression line labs(title = “Bill Depth vs Body Mass”, x = “Bill Depth (mm)”, y = “Body Mass (g)”) + theme_minimal()
#Hypothesis: The bill size is positively correlated to the body mass of the penguins with differing effects depending on the species, diet and island. #Dependent variable(y): Body mass (g) #Independent variable(x): Bill size (calculated as bill length × bill dept) #Control variables: Species, Diet, and Island
library(tidyverse) library(ggplot2) library(car)
penguins\(bill_size <- penguins\)bill_length_mm * penguins$bill_depth_mm
penguins <- na.omit(penguins)
penguins\(species <- as.factor(penguins\)species) penguins\(diet <- as.factor(penguins\)diet) penguins\(island <- as.factor(penguins\)island)
model <- lm(body_mass_g ~ bill_size * species * diet * island, data = penguins)
summary(model)
par(mfrow = c(2, 2)) # Split plot into 2x2 grid plot(model) # Check residuals, normality, homoscedasticity
ggplot(penguins, aes(x = bill_size, y = body_mass_g, color = species, shape = diet)) + geom_point() + geom_smooth(method = “lm”, se = FALSE) + facet_wrap(~island) + # Separate plots by island labs(title = “Relationship Between Bill Size and Body Mass Across Islands”, x = “Bill Size (Bill Length x Bill Depth)”, y = “Body Mass (g)”) + theme_minimal()
library(dplyr)
library(dplyr)
penguins %>% group_by(species) %>% summarize(mean_bill_length = mean(bill_length_mm, na.rm = TRUE), mean_bill_depth = mean(bill_depth_mm, na.rm = TRUE), mean_body_mass = mean(body_mass_g, na.rm = TRUE))
citation()
library(car) Anova(model) citation(“car”)
```