#Install and load packages
library(tidyverse)
library(stats)
library(rrcov)
library(readxl)#Reads excel data
Part 1. Use tidyverse to create a summary dataset that shows the average of bill_length_mm, bill_depth_mm, and flipper_length_mm by species
# Read the raw penguin measurements from the homework workbook.
df <- read_excel("homework_2_penguins.xlsx")

# Build the working data frame: drop every row containing an NA, then convert
# all character columns and `year` to factors so that summary() reports counts
# per level instead of treating them as free text / numbers.
# across(where(is.character), ...) selects the columns in a single, type-stable
# step instead of evaluating an sapply() predicate twice.
penguins <- na.omit(df) %>%
  mutate(
    across(where(is.character), as.factor),
    year = as.factor(year)
  )

summary(penguins)
## species island bill_length_mm bill_depth_mm
## Adelie :146 Biscoe :163 Min. :32.10 Min. :13.10
## Chinstrap: 68 Dream :123 1st Qu.:39.50 1st Qu.:15.60
## Gentoo :119 Torgersen: 47 Median :44.50 Median :17.30
## Mean :43.99 Mean :17.16
## 3rd Qu.:48.60 3rd Qu.:18.70
## Max. :59.60 Max. :21.50
## flipper_length_mm body_mass_g sex year
## Min. :172 Min. :2700 female:165 2007:103
## 1st Qu.:190 1st Qu.:3550 male :168 2008:113
## Median :197 Median :4050 2009:117
## Mean :201 Mean :4207
## 3rd Qu.:213 3rd Qu.:4775
## Max. :231 Max. :6300
The data has penguin body data for three species collected at three islands over three years. It is almost equally divided between females and males.
Let’s put this information into a new dataframe by species and summarize the body size variables.
# Per-species means of the four body-size measurements. Each output column is
# named "ave_" followed by the name of the column it averages.
p_summary <- penguins %>%
  group_by(species) %>%
  summarise(
    across(
      c(bill_length_mm, bill_depth_mm, flipper_length_mm, body_mass_g),
      mean,
      .names = "ave_{.col}"
    )
  )

# Print the summary table.
p_summary
## # A tibble: 3 × 5
## species ave_bill_length_mm ave_bill_depth_mm ave_flipper_length_mm
## <fct> <dbl> <dbl> <dbl>
## 1 Adelie 38.8 18.3 190.
## 2 Chinstrap 48.8 18.4 196.
## 3 Gentoo 47.6 15.0 217.
## # ℹ 1 more variable: ave_body_mass_g <dbl>
Part 2. Create boxplots that compare penguins by sex for bill_length_mm, bill_depth_mm, and flipper_length_mm
# Boxplot of bill length (mm) for female vs. male penguins.
ggplot(data = penguins, aes(x = sex, y = bill_length_mm)) +
  geom_boxplot() +
  theme_classic() +
  # Named labels are matched to the factor levels, so the axis text cannot be
  # swapped if the level order ever changes.
  scale_x_discrete("Sex", labels = c(female = "Female", male = "Male")) +
  scale_y_continuous("Bill Length (mm)", breaks = seq(30, 60, by = 5)) +
  # theme_classic() draws no gridlines; add light horizontal guides back.
  # `linewidth` replaces the `size` argument, which is deprecated for line
  # elements since ggplot2 3.4.0.
  theme(
    panel.grid.major.y = element_line(
      color = "gray",
      linewidth = 0.25,
      linetype = 1
    )
  ) +
  ggtitle("Comparison of Penguin Bill Length and Sex")
# Boxplot of bill depth (mm) for female vs. male penguins.
ggplot(data = penguins, aes(x = sex, y = bill_depth_mm)) +
  geom_boxplot() +
  theme_classic() +
  # Named labels are matched to the factor levels, so the axis text cannot be
  # swapped if the level order ever changes.
  scale_x_discrete("Sex", labels = c(female = "Female", male = "Male")) +
  scale_y_continuous("Bill Depth (mm)", breaks = seq(13, 23, by = 1)) +
  # theme_classic() draws no gridlines; add light horizontal guides back.
  # `linewidth` replaces the `size` argument, which is deprecated for line
  # elements since ggplot2 3.4.0.
  theme(
    panel.grid.major.y = element_line(
      color = "gray",
      linewidth = 0.25,
      linetype = 1
    )
  ) +
  ggtitle("Comparison of Penguin Bill Depth and Sex")
# Boxplot of flipper length (mm) for female vs. male penguins.
ggplot(data = penguins, aes(x = sex, y = flipper_length_mm)) +
  geom_boxplot() +
  theme_classic() +
  # Named labels are matched to the factor levels, so the axis text cannot be
  # swapped if the level order ever changes.
  scale_x_discrete("Sex", labels = c(female = "Female", male = "Male")) +
  scale_y_continuous("Flipper Length (mm)", breaks = seq(170, 235, by = 10)) +
  # theme_classic() draws no gridlines; add light horizontal guides back.
  # `linewidth` replaces the `size` argument, which is deprecated for line
  # elements since ggplot2 3.4.0.
  theme(
    panel.grid.major.y = element_line(
      color = "gray",
      linewidth = 0.25,
      linetype = 1
    )
  ) +
  ggtitle("Comparison of Penguin Flipper Length and Sex")
Part 3. Run t.tests to compare penguins by sex for bill_length_mm, bill_depth_mm, and flipper_length_mm
Is there a significant difference between the sexes in bill length, bill depth, and flipper length? Running a t-test on each variable, grouped by sex, returns a p-value. If a p-value is small (traditionally < 0.05), there is statistical evidence that the means of the two groups differ.
# Two-sided Welch two-sample t-test: does mean bill length differ by sex?
t.test(bill_length_mm ~ sex, data = penguins)
##
## Welch Two Sample t-test
##
## data: bill_length_mm by sex
## t = -6.6725, df = 329.29, p-value = 1.066e-10
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
## -4.865676 -2.649908
## sample estimates:
## mean in group female mean in group male
## 42.09697 45.85476
# Two-sided Welch two-sample t-test: does mean bill depth differ by sex?
t.test(bill_depth_mm ~ sex, data = penguins)
##
## Welch Two Sample t-test
##
## data: bill_depth_mm by sex
## t = -7.309, df = 330.88, p-value = 2.036e-12
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
## -1.860077 -1.071157
## sample estimates:
## mean in group female mean in group male
## 16.42545 17.89107
# Two-sided Welch two-sample t-test: does mean flipper length differ by sex?
t.test(flipper_length_mm ~ sex, data = penguins)
##
## Welch Two Sample t-test
##
## data: flipper_length_mm by sex
## t = -4.8079, df = 325.28, p-value = 2.336e-06
## alternative hypothesis: true difference in means between group female and group male is not equal to 0
## 95 percent confidence interval:
## -10.064811 -4.219821
## sample estimates:
## mean in group female mean in group male
## 197.3636 204.5060
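For all three variables the p-values are far below 0.05, which means there is a statistically significant difference in the mean size of each feature between female and male penguins. In every case the male mean is larger than the female mean (see the sample estimates above).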