library(palmerpenguins)
##
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
##
## penguins, penguins_raw
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.6
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ ggplot2 4.0.1 ✔ tibble 3.3.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.2
## ✔ purrr 1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
data(penguins)
# Dummy variable setup (assisted by Simon).
# Codes sex as a numeric indicator: 1 = female, 0 = male, NA where sex is
# missing. The result is only printed here; it is not assigned to anything.
mutate(penguins, dummy_sex = as.numeric(sex == "female"))
## # A tibble: 344 × 9
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # ℹ 334 more rows
## # ℹ 3 more variables: sex <fct>, year <int>, dummy_sex <dbl>
# Cleaning data (assisted by Simon).
# Keep only the rows where sex, flipper length, and island are all observed.
penguins_sexclean <- penguins %>%
  drop_na(sex, flipper_length_mm, island)
# Mean flipper length (mm) within each sex, computed on the cleaned data.
summarise(
  group_by(penguins_sexclean, sex),
  mean_flipper = mean(flipper_length_mm)
)
## # A tibble: 2 × 2
## sex mean_flipper
## <fct> <dbl>
## 1 female 197.
## 2 male 205.
# Linear model for Q1: flipper length predicted by sex.
# In the summary, the intercept is the mean for the reference level (female)
# and `sexmale` is the male-minus-female difference in mean flipper length.
lm.Q1 <- lm(
  formula = flipper_length_mm ~ sex,
  data = penguins_sexclean
)
summary(lm.Q1)
##
## Call:
## lm(formula = flipper_length_mm ~ sex, data = penguins_sexclean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -26.506 -10.364 -4.364 12.636 26.494
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 197.364 1.057 186.792 < 2e-16 ***
## sexmale 7.142 1.488 4.801 2.39e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.57 on 331 degrees of freedom
## Multiple R-squared: 0.06511, Adjusted R-squared: 0.06229
## F-statistic: 23.05 on 1 and 331 DF, p-value: 2.391e-06
# Sanity check (assisted by Simon): the mean flipper length of females in the
# raw data should match the intercept of lm.Q1. na.rm = TRUE discards the NA
# entries produced by rows whose sex or flipper length is missing.
with(penguins, mean(flipper_length_mm[sex == "female"], na.rm = TRUE))
## [1] 197.3636
# Plot for Q1: flipper length by sex, drawn as a translucent violin with
# jittered raw points and a boxplot layered on top.
ggplot(penguins_sexclean, aes(x = sex, y = flipper_length_mm, color = sex)) +
  # `alpha` alone sets the violin transparency. The former `valpha = 0.3` was
  # not a geom_violin() argument, so ggplot2 ignored it with a warning.
  # `sex` is already a factor, so no as.factor() wrapper is needed.
  geom_violin(aes(fill = sex), alpha = 0.2, show.legend = FALSE) +
  labs(
    x = "Penguin Sex",
    y = "Flipper Length (mm)",
    title = "Sex Differences in Penguin Flipper Length"
  ) +
  geom_jitter(width = 0.15, alpha = 0.7) +
  # The outline colour is inherited from the plot-level aes(color = sex).
  geom_boxplot(width = 0.5, fill = "white", alpha = 0.2) +
  #Citation: https://stackoverflow.com/questions/7466023/how-to-give-color-to-each-class-in-scatter-plot-in-r
  scale_color_manual(values = c("female" = "skyblue", "male" = "coral")) +
  scale_fill_manual(values = c("female" = "skyblue", "male" = "coral")) +
  #Citation: https://stackoverflow.com/questions/40675778/center-plot-title-in-ggplot2
  theme(plot.title.position = "plot", plot.title = element_text(hjust = 0.5))
~ZQ
Question 2: Are penguins on different islands the same size?
Q2.1: This differs from the first question because the IV has more than two levels, which results in multiple slope coefficients (one for each non-reference level). Equation: Body Mass = b0 + b1(Dream) + b2(Torgersen) + error, where b0 = mean body mass for penguins on Biscoe, b1 = mean difference in body mass between Dream and Biscoe, and b2 = mean difference in body mass between Torgersen and Biscoe.
# Linear model for Q2: body mass predicted by island.
# In the summary, the intercept is the mean body mass for the reference island
# (Biscoe); `islandDream` and `islandTorgersen` are differences from Biscoe.
lm.Q2 <- lm(
  formula = body_mass_g ~ island,
  data = penguins_sexclean
)
summary(lm.Q2)
##
## Call:
## lm(formula = body_mass_g ~ island, data = penguins_sexclean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1869.17 -368.90 5.83 431.10 1580.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4719.17 49.45 95.438 <2e-16 ***
## islandDream -1000.27 75.40 -13.266 <2e-16 ***
## islandTorgersen -1010.66 104.52 -9.669 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 631.3 on 330 degrees of freedom
## Multiple R-squared: 0.389, Adjusted R-squared: 0.3853
## F-statistic: 105.1 on 2 and 330 DF, p-value: < 2.2e-16
Q2.2: In traditional ("OG") psych stats, this analysis is called a one-way ANOVA. The IV is island, with three levels, and the DV is body size (measured here as body mass in grams).
# ANOVA for Q2: body mass by island, expressed as a linear model.
# This fits the same formula on the same data as the Q2 linear model; the
# F-statistic on the last line of the summary is the omnibus test across
# the three islands.
anova.Q2 <- lm(
  formula = body_mass_g ~ island,
  data = penguins_sexclean
)
summary(anova.Q2)
##
## Call:
## lm(formula = body_mass_g ~ island, data = penguins_sexclean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1869.17 -368.90 5.83 431.10 1580.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4719.17 49.45 95.438 <2e-16 ***
## islandDream -1000.27 75.40 -13.266 <2e-16 ***
## islandTorgersen -1010.66 104.52 -9.669 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 631.3 on 330 degrees of freedom
## Multiple R-squared: 0.389, Adjusted R-squared: 0.3853
## F-statistic: 105.1 on 2 and 330 DF, p-value: < 2.2e-16
# Plot for Q2: individual body masses on each island, jittered horizontally
# so that overlapping points stay visible.
ggplot(penguins_sexclean, aes(x = island, y = body_mass_g)) +
  geom_jitter(width = 0.15, alpha = 0.7) +
  labs(
    x = "Island",
    y = "Body Mass (g)",
    title = "Penguin Body Mass by Island"
  )