library(tidyverse)

df_raw <- read_csv("Penguin_Measures_wIDs copy.csv")

# Pick columns 
outcome_col   <- "body_mass_g"
predictor_col <- "flipper_length_mm"
group_col     <- "species"

df <- df_raw %>%
  select(all_of(c(outcome_col, predictor_col, group_col))) %>%
  rename(
    outcome   = all_of(outcome_col),
    predictor = all_of(predictor_col),
    group     = all_of(group_col)
  ) %>%
  mutate(
    group     = as.factor(group),
    outcome   = as.numeric(outcome),
    predictor = as.numeric(predictor)
  ) %>%
  filter(!is.na(outcome), !is.na(predictor), !is.na(group))

# Quick checks
df %>% slice_head(n = 6)
## # A tibble: 6 × 3
##   outcome predictor group 
##     <dbl>     <dbl> <fct> 
## 1    3400       174 Adelie
## 2    3800       189 Adelie
## 3    3800       187 Adelie
## 4    3200       187 Adelie
## 5    3150       172 Adelie
## 6    4500       211 Gentoo
df %>% count(group)
## # A tibble: 3 × 2
##   group         n
##   <fct>     <int>
## 1 Adelie      151
## 2 Chinstrap    68
## 3 Gentoo      123
# Model
lm_result <- lm(outcome ~ predictor + group, data = df)
summary(lm_result)
## 
## Call:
## lm(formula = outcome ~ predictor + group, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -927.70 -254.82  -23.92  241.16 1191.68 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -4031.477    584.151  -6.901 2.55e-11 ***
## predictor         40.705      3.071  13.255  < 2e-16 ***
## groupChinstrap  -206.510     57.731  -3.577 0.000398 ***
## groupGentoo      266.810     95.264   2.801 0.005392 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 375.5 on 338 degrees of freedom
## Multiple R-squared:  0.7826, Adjusted R-squared:  0.7807 
## F-statistic: 405.7 on 3 and 338 DF,  p-value: < 2.2e-16
# Plot (this will appear in the knitted HTML)
ggplot(df, aes(x = predictor, y = outcome, color = group)) +
  geom_point(alpha = 0.75) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~ group) +
  labs(
    title = "Body mass vs Flipper length by Species",
    x = "Flipper length (mm)",
    y = "Body mass (g)",
    color = "Species"
  ) +
  theme_minimal()