library(palmerpenguins)
## 
## Attaching package: 'palmerpenguins'
## The following objects are masked from 'package:datasets':
## 
##     penguins, penguins_raw
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.2
## ✔ purrr     1.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
data(penguins)
# Dummy variable setup (assisted by Simon).
# Adds `dummy_sex`: 1 for female, 0 for male, NA where sex is missing.
# The result is only printed here; it is not assigned back to `penguins`.
penguins %>%
  mutate(dummy_sex = as.numeric(sex == "female"))
## # A tibble: 344 × 9
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           NA            NA                  NA          NA
##  5 Adelie  Torgersen           36.7          19.3               193        3450
##  6 Adelie  Torgersen           39.3          20.6               190        3650
##  7 Adelie  Torgersen           38.9          17.8               181        3625
##  8 Adelie  Torgersen           39.2          19.6               195        4675
##  9 Adelie  Torgersen           34.1          18.1               193        3475
## 10 Adelie  Torgersen           42            20.2               190        4250
## # ℹ 334 more rows
## # ℹ 3 more variables: sex <fct>, year <int>, dummy_sex <dbl>
# Cleaning data (assisted by Simon): keep only the rows where sex, flipper
# length and island are all observed.
penguins_sexclean <- penguins %>%
  drop_na(sex, flipper_length_mm, island)
# Mean flipper length (mm) within each sex, computed on the cleaned data
penguins_sexclean %>%
  group_by(sex) %>%
  summarise(mean_flipper = mean(flipper_length_mm))
## # A tibble: 2 × 2
##   sex    mean_flipper
##   <fct>         <dbl>
## 1 female         197.
## 2 male           205.
# Linear model for Q1: flipper length regressed on sex.
# With the two-level factor `sex`, the intercept is the mean for the
# reference level and the slope is the difference between the two levels.
lm.Q1 <- lm(
  formula = flipper_length_mm ~ sex,
  data = penguins_sexclean
)
summary(lm.Q1)
## 
## Call:
## lm(formula = flipper_length_mm ~ sex, data = penguins_sexclean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -26.506 -10.364  -4.364  12.636  26.494 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  197.364      1.057 186.792  < 2e-16 ***
## sexmale        7.142      1.488   4.801 2.39e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.57 on 331 degrees of freedom
## Multiple R-squared:  0.06511,    Adjusted R-squared:  0.06229 
## F-statistic: 23.05 on 1 and 331 DF,  p-value: 2.391e-06
# Sanity check (assisted by Simon): the raw mean flipper length of female
# penguins, for comparison with the intercept reported by summary(lm.Q1).
with(penguins, mean(flipper_length_mm[sex %in% "female"], na.rm = TRUE))
## [1] 197.3636
# Plot for Q1: flipper length by sex, drawn as three layers in this order:
# violin (distribution shape), jittered raw points, then a narrow boxplot.
ggplot(penguins_sexclean, aes(x = sex, y = flipper_length_mm, color = sex)) +
  # `sex` is already a factor, so it is mapped directly. `alpha = 0.2` is the
  # transparency that was being applied; the misspelled `valpha` argument is
  # not a geom_violin() parameter and was dropped.
  geom_violin(aes(fill = sex), alpha = 0.2, show.legend = FALSE) +
  geom_jitter(width = 0.15, alpha = 0.7) +
  # Outline colour is inherited from the plot-level `color = sex` mapping
  geom_boxplot(width = 0.5, fill = "white", alpha = 0.2) +
  labs(
    x = "Penguin Sex",
    y = "Flipper Length (mm)",
    title = "Sex Differences in Penguin Flipper Length"
  ) +
  # Citation: https://stackoverflow.com/questions/7466023/how-to-give-color-to-each-class-in-scatter-plot-in-r
  scale_color_manual(values = c("female" = "skyblue", "male" = "coral")) +
  scale_fill_manual(values = c("female" = "skyblue", "male" = "coral")) +
  # Citation: https://stackoverflow.com/questions/40675778/center-plot-title-in-ggplot2
  theme(plot.title.position = "plot", plot.title = element_text(hjust = 0.5))

~ZQ
Question 2: Are penguins on different islands the same size?

Q2.1: This differs from the first question because the IV has more than two levels, which results in multiple slope coefficients. Equation: Body Mass = b0 + b1(Dream) + b2(Torgersen) + error, where b0 = mean body mass for penguins on Biscoe, b1 = mean difference in body mass between Dream and Biscoe, and b2 = mean difference in body mass between Torgersen and Biscoe.

# Linear model for Q2: body mass regressed on island.
# `island` has three levels, so the fit reports an intercept plus one
# coefficient for each non-reference island.
lm.Q2 <- lm(
  formula = body_mass_g ~ island,
  data = penguins_sexclean
)
summary(lm.Q2)
## 
## Call:
## lm(formula = body_mass_g ~ island, data = penguins_sexclean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1869.17  -368.90     5.83   431.10  1580.83 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4719.17      49.45  95.438   <2e-16 ***
## islandDream     -1000.27      75.40 -13.266   <2e-16 ***
## islandTorgersen -1010.66     104.52  -9.669   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 631.3 on 330 degrees of freedom
## Multiple R-squared:  0.389,  Adjusted R-squared:  0.3853 
## F-statistic: 105.1 on 2 and 330 DF,  p-value: < 2.2e-16

Q2.2: In a traditional ("OG") psychology statistics course, this analysis is called a one-way ANOVA. The IV is island, which has three levels, and the DV is body size (measured as body mass).

# ANOVA for body mass & island.
# This fits the same specification as lm.Q2 (body mass on island); the
# F-statistic line of the summary is the overall test across the islands.
# NOTE(review): anova(anova.Q2) would print the classical ANOVA table
# (sums of squares, df, F) if that layout is wanted.
anova.Q2 <- lm(
  formula = body_mass_g ~ island,
  data = penguins_sexclean
)
summary(anova.Q2)
## 
## Call:
## lm(formula = body_mass_g ~ island, data = penguins_sexclean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1869.17  -368.90     5.83   431.10  1580.83 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      4719.17      49.45  95.438   <2e-16 ***
## islandDream     -1000.27      75.40 -13.266   <2e-16 ***
## islandTorgersen -1010.66     104.52  -9.669   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 631.3 on 330 degrees of freedom
## Multiple R-squared:  0.389,  Adjusted R-squared:  0.3853 
## F-statistic: 105.1 on 2 and 330 DF,  p-value: < 2.2e-16
# Plot for Q2: jittered body mass observations for each island
ggplot(penguins_sexclean, aes(x = island, y = body_mass_g)) +
  geom_jitter(width = 0.15, alpha = 0.7) +
  labs(
    x = "Island",
    y = "Body Mass (g)",
    title = "Penguin Body Mass by Island"
  )