Data with Context :

Variable Name Type Description Units
species Categorical (Factor) Penguin species classification
island Categorical (Factor) Island in the Palmer Archipelago where observation occurred
bill_length_mm Numerical (Continuous) Length of the penguin’s bill (beak) millimeters
bill_depth_mm Numerical (Continuous) Depth (height) of the penguin’s bill millimeters
flipper_length_mm Numerical (Discrete) Length of the penguin’s flipper millimeters
body_mass_g Numerical (Discrete) Body mass of the penguin grams
sex Categorical (Factor) Biological sex of the penguin
year Numerical (Discrete) Year the observation was recorded year
library(palmerpenguins)
?palmerpenguins
penguins
## # A tibble: 344 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           NA            NA                  NA          NA
##  5 Adelie  Torgersen           36.7          19.3               193        3450
##  6 Adelie  Torgersen           39.3          20.6               190        3650
##  7 Adelie  Torgersen           38.9          17.8               181        3625
##  8 Adelie  Torgersen           39.2          19.6               195        4675
##  9 Adelie  Torgersen           34.1          18.1               193        3475
## 10 Adelie  Torgersen           42            20.2               190        4250
## # ℹ 334 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
?penguins
boxplot(
  penguins$bill_depth_mm,
  horizontal = TRUE,
  main = "Boxplot of Penguin Bill Depth"
)

par(mfrow = c(2, 1), mar = c(4, 4, 2, 1))

hist(
  penguins$bill_depth_mm,
  main = "Histogram of Penguin Bill Depth",
  xlab = "Bill Depth (mm)"
)

boxplot(
  penguins$bill_depth_mm,
  horizontal = TRUE,
  main = "",
  xlab = "Bill Depth (mm)"
)

par(mfrow = c(2, 1), mar = c(4, 4, 2, 1))

hist(
  penguins$bill_length_mm,
  main = "Histogram of Penguin Bill Depth",
  xlab = "Bill length (mm)"
)

boxplot(
  penguins$bill_length_mm,
  horizontal = TRUE,
  main = "",
  xlab = "Bill length (mm)"
)

library(palmerpenguins)

# Remove missing values
penguins_clean <- na.omit(penguins[, c("bill_length_mm", "species")])

# Define colors for each species
species_colors <- c(
  "Adelie" = "tomato",
  "Chinstrap" = "darkseagreen3",
  "Gentoo" = "steelblue"
)

# Ensure consistent ordering
penguins_clean$species <- factor(
  penguins_clean$species,
  levels = c("Adelie", "Chinstrap", "Gentoo")
)

# Set layout: 2 rows (histograms and boxplots), 3 columns (species)
par(mfrow = c(2, 3), mar = c(4, 4, 3, 1))

# Histograms by species
for (sp in levels(penguins_clean$species)) {
  hist(
    penguins_clean$bill_length_mm[penguins_clean$species == sp],
    col = species_colors[sp],
    main = paste("Histogram -", sp),
    xlab = "Bill length (mm)",
    xlim = range(penguins_clean$bill_length_mm),
    breaks = 15,
    border = "white"
  )
}

# Boxplots by species (one per panel)
for (sp in levels(penguins_clean$species)) {
  boxplot(
    penguins_clean$bill_length_mm[penguins_clean$species == sp],
    horizontal = TRUE,
    col = species_colors[sp],
    main = paste("Boxplot -", sp),
    xlab = "Bill length (mm)"
  )
}

# set up plotting window: 1 row, 3 columns
par(mfrow = c(1, 3))

# Adelie
hist(
  penguins$bill_length_mm[penguins$species == "Adelie"],
  col = "skyblue",
  main = "Adelie",
  xlab = "Bill Length (mm)"
)

# Chinstrap
hist(
  penguins$bill_length_mm[penguins$species == "Chinstrap"],
  col = "orange",
  main = "Chinstrap",
  xlab = "Bill Length (mm)"
)

# Gentoo
hist(
  penguins$bill_length_mm[penguins$species == "Gentoo"],
  col = "purple",
  main = "Gentoo",
  xlab = "Bill Length (mm)"
)

# reset plotting layout
par(mfrow = c(1, 1))
# Load necessary package (optional if using base R only)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
penguins_clean <- na.omit(penguins)

# Set up plotting area: 1 row, 3 columns
par(mfrow = c(1, 3))

species_list <- c("Adelie", "Chinstrap", "Gentoo")

for (sp in species_list) {

  species_data <- penguins_clean %>%
    filter(species == sp)

  male <- species_data %>%
    filter(sex == "male") %>%
    pull(bill_length_mm)

  female <- species_data %>%
    filter(sex == "female") %>%
    pull(bill_length_mm)

  # Plot male histogram
  hist(
    male,
    breaks = 15,
    col = rgb(0, 0, 1, 0.6),
    xlim = range(c(male, female)),
    main = sp,
    xlab = "Bill Length (mm)"
  )

  # Add female histogram on top
  hist(
    female,
    breaks = 15,
    col = rgb(1, 0, 0, 0.6),
    add = TRUE
  )

  legend(
    "topright",
    legend = c("Male", "Female"),
    fill = c(rgb(0, 0, 1, 0.6), rgb(1, 0, 0, 0.6))
  )
}

# Overall title
mtext(
  "Bill Length Split by Species",
  outer = TRUE,
  line = -2,
  cex = 1.2
)

# Reset plotting layout
par(mfrow = c(1, 1))
library(ggplot2)
library(dplyr)
library(tidyr)

penguins %>%
  drop_na(bill_length_mm, species, sex) %>%
  ggplot(aes(x = bill_length_mm, fill = sex)) +
  geom_histogram(bins = 14, color = "white", alpha = 0.8) +
  facet_grid(sex ~ species) +
  labs(
    title = "Distribution of Penguin Bill Length by Species and Gender",
    x = "Bill length (mm)",
    y = "Count",
    fill = "Gender"
  ) +
  theme_minimal()

# Set up a 2x2 plotting layout
par(mfrow = c(2, 2))

hist(
  penguins$bill_length_mm,
  breaks = 30,
  main = "Histogram of Bill Length (mm)",
  xlab = "Bill Length (mm)"
)

hist(
  penguins$bill_depth_mm,
  breaks = 30,
  main = "Histogram of Bill Depth (mm)",
  xlab = "Bill Depth (mm)"
)

hist(
  penguins$flipper_length_mm,
  breaks = 30,
  main = "Histogram of Flipper Length (mm)",
  xlab = "Flipper Length (mm)"
)

hist(
  penguins$body_mass_g,
  breaks = 30,
  main = "Histogram of Body Mass (g)",
  xlab = "Body Mass (g)"
)

# Reset layout
par(mfrow = c(1, 1))

Generate new feature :

unique(penguins$island)
## [1] Torgersen Biscoe    Dream    
## Levels: Biscoe Dream Torgersen
unique(penguins$species)
## [1] Adelie    Gentoo    Chinstrap
## Levels: Adelie Chinstrap Gentoo
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.5
## ✔ lubridate 1.9.3     ✔ stringr   1.5.1
## ✔ purrr     1.1.0     ✔ tibble    3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
penguins |>
  select(
    species,
    bill_length_mm,
    bill_depth_mm,
    flipper_length_mm,
    body_mass_g
  ) |>
  pivot_longer(
    cols = -species,
    names_to = "variable",
    values_to = "value"
  ) |>
  ggplot(aes(x = value, fill = species)) +
  geom_histogram(
    bins = 30,
    position = "identity",
    alpha = 0.6
  ) +
  facet_wrap(~ variable, scales = "free") +
  labs(
    title = "Histograms of Penguin Measurements by Species",
    x = "Value",
    y = "Count",
    fill = "Species"
  ) +
  theme_minimal()
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_bin()`).

library(tidyverse)

penguins |>
  select(
    island,
    bill_length_mm,
    bill_depth_mm,
    flipper_length_mm,
    body_mass_g
  ) |>
  pivot_longer(
    cols = -island,
    names_to = "variable",
    values_to = "value"
  ) |>
  ggplot(aes(x = value, fill = island)) +
  geom_histogram(
    bins = 30,
    position = "identity",
    alpha = 0.6
  ) +
  facet_wrap(~ variable, scales = "free") +
  labs(
    title = "Histograms of Penguin Measurements by Island",
    x = "Value",
    y = "Count",
    fill = "Island"
  ) +
  theme_minimal()
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_bin()`).

library(tidyverse)

# Create summary table
bill_length_stats <-
  penguins |>
  group_by(species) |>
  summarize(
    mean = mean(bill_length_mm, na.rm = TRUE),
    sd   = sd(bill_length_mm, na.rm = TRUE),
    .groups = "drop"
  ) |>
  mutate(
    label = paste0(
      "Mean: ", round(mean, 2),
      "\nSD: ", round(sd, 2)
    )
  )

species_bill_length <-
  penguins |>
  ggplot(aes(x = bill_length_mm)) +
  geom_histogram(
    bins = 30,
    fill = "steelblue",
    color = "white"
  ) +
  facet_wrap(~ species) +
  geom_text(
    data = bill_length_stats,
    aes(
      x = Inf,
      y = Inf,
      label = label
    ),
    hjust = 1.1,
    vjust = 1.1,
    inherit.aes = FALSE
  ) +
  labs(
    title = "Bill Length by Species",
    x = "Bill Length (mm)",
    y = "Count"
  ) +
  theme_minimal()
species_bill_length
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

library(tidyverse)

# Remove NA values
penguins_clean <-
  penguins |>
  drop_na(bill_length_mm, sex)

# Summary table (overall + by sex)
bill_length_stats <-
  penguins_clean |>
  group_by(species, sex) |>
  summarize(
    mean = mean(bill_length_mm),
    sd   = sd(bill_length_mm),
    .groups = "drop"
  ) |>
  mutate(
    label = paste0(
      "Mean: ", round(mean, 2),
      "\nSD: ", round(sd, 2)
    )
  )

species_bill_length <-
  penguins_clean |>
  ggplot(aes(x = bill_length_mm, fill = sex)) +
  geom_histogram(
    bins = 20,
    color = "white"
  ) +
  facet_wrap(~ species + sex, ncol = 2) +
  geom_text(
    data = bill_length_stats,
    aes(
      x = Inf,
      y = Inf,
      label = label
    ),
    hjust = 1.1,
    vjust = 1.1,
    inherit.aes = FALSE
  ) +
  labs(
    title = "Bill Length by Species and Sex",
    x = "Bill Length (mm)",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

species_bill_length

library(tidyverse)

# Create summary table grouped by island
bill_length_stats <-
  penguins |>
  group_by(island) |>
  summarize(
    mean = mean(bill_length_mm, na.rm = TRUE),
    sd   = sd(bill_length_mm, na.rm = TRUE),
    .groups = "drop"
  ) |>
  mutate(
    label = paste0(
      "Mean: ", round(mean, 2),
      "\nSD: ", round(sd, 2)
    )
  )

# Plot
island_bill_length <- 
  penguins |>
  ggplot(aes(x = bill_length_mm)) +
  geom_histogram(
    bins = 30,
    fill = "steelblue",
    color = "white"
  ) +
  facet_wrap(~ island) +
  geom_text(
    data = bill_length_stats,
    aes(
      x = Inf,
      y = Inf,
      label = label
    ),
    hjust = 1.1,
    vjust = 1.1,
    inherit.aes = FALSE
  ) +
  labs(
    title = "Bill Length by Island",
    x = "Bill Length (mm)",
    y = "Count"
  ) +
  theme_minimal()
island_bill_length
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

library(tidyverse)

species_bill_length_stats <-
  penguins |>
  group_by(species) |>
  summarize(
    mean = mean(bill_length_mm, na.rm = TRUE),
    sd   = sd(bill_length_mm, na.rm = TRUE),
    max_count = max(hist(bill_length_mm, plot = FALSE)$counts),
    .groups = "drop"
  ) |>
  mutate(
    label = paste0(
      "Mean: ", round(mean, 2),
      "\nSD: ", round(sd, 2)
    )
  )

species_bill_length_plot <-
  penguins |>
  ggplot(aes(x = bill_length_mm)) +
  geom_histogram(
    bins = 30,
    fill = "steelblue",
    color = "white"
  ) +
  facet_wrap(~ species) +
  geom_label(
    data = species_bill_length_stats,
    aes(
      x = Inf,
      y = max_count,
      label = label
    ),
    hjust = 1.1,
    vjust = 1.1,
    inherit.aes = FALSE,
    fill = "white",
    label.size = 0.3
  ) +
  labs(
    title = "Bill Length by Species",
    x = "Bill Length (mm)",
    y = "Count"
  ) +
  theme_minimal()
## Warning: The `label.size` argument of `geom_label()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
island_bill_length_stats <-
  penguins |>
  group_by(island) |>
  summarize(
    mean = mean(bill_length_mm, na.rm = TRUE),
    sd   = sd(bill_length_mm, na.rm = TRUE),
    max_count = max(hist(bill_length_mm, plot = FALSE)$counts),
    .groups = "drop"
  ) |>
  mutate(
    label = paste0(
      "Mean: ", round(mean, 2),
      "\nSD: ", round(sd, 2)
    )
  )

island_bill_length_plot <-
  penguins |>
  ggplot(aes(x = bill_length_mm)) +
  geom_histogram(
    bins = 30,
    fill = "steelblue",
    color = "white"
  ) +
  facet_wrap(~ island) +
  geom_label(
    data = island_bill_length_stats,
    aes(
      x = Inf,
      y = max_count,
      label = label
    ),
    hjust = 1.1,
    vjust = 1.1,
    inherit.aes = FALSE,
    fill = "white",
    label.size = 0.3
  ) +
  labs(
    title = "Bill Length by Island",
    x = "Bill Length (mm)",
    y = "Count"
  ) +
  theme_minimal()

library(patchwork)

species_bill_length_plot /
  island_bill_length_plot
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

library(tidyverse)

# Create summary statistics
bill_depth_stats <-
  penguins |>
  group_by(species) |>
  summarize(
    mean = mean(bill_depth_mm, na.rm = TRUE),
    sd   = sd(bill_depth_mm, na.rm = TRUE),
    .groups = "drop"
  ) |>
  mutate(
    label = paste0(
      "Mean: ", round(mean, 2),
      "\nSD: ", round(sd, 2)
    )
  )

# Plot
penguins |>
  ggplot(aes(x = bill_depth_mm)) +
  geom_histogram(
    bins = 30,
    fill = "steelblue",
    color = "white"
  ) +
  facet_wrap(~ species) +
  geom_text(
    data = bill_depth_stats,
    aes(
      x = Inf,
      y = Inf,
      label = label
    ),
    hjust = 1.1,
    vjust = 1.1,
    inherit.aes = FALSE
  ) +
  labs(
    title = "Bill Depth by Species",
    x = "Bill Depth (mm)",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

library(tidyverse)

# Remove NA values
penguins_clean <-
  penguins |>
  drop_na(bill_depth_mm, sex)

# Summary statistics by species and sex
bill_depth_stats <-
  penguins_clean |>
  group_by(species, sex) |>
  summarize(
    mean = mean(bill_depth_mm),
    sd   = sd(bill_depth_mm),
    .groups = "drop"
  ) |>
  mutate(
    label = paste0(
      "Mean: ", round(mean, 2),
      "\nSD: ", round(sd, 2)
    )
  )

species_bill_depth <-
  penguins_clean |>
  ggplot(aes(x = bill_depth_mm, fill = sex)) +
  geom_histogram(
    bins = 13,
    color = "white"
  ) +
  facet_wrap(~ species + sex, ncol = 2) +
  geom_text(
    data = bill_depth_stats,
    aes(
      x = Inf,
      y = Inf,
      label = label
    ),
    hjust = 1.1,
    vjust = 1.1,
    inherit.aes = FALSE
  ) +
  labs(
    title = "Bill Depth by Species and Sex",
    x = "Bill Depth (mm)",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

species_bill_depth

library(tidyverse)

# Create summary statistics grouped by island
bill_depth_stats <-
  penguins |>
  group_by(island) |>
  summarize(
    mean = mean(bill_depth_mm, na.rm = TRUE),
    sd   = sd(bill_depth_mm, na.rm = TRUE),
    .groups = "drop"
  ) |>
  mutate(
    label = paste0(
      "Mean: ", round(mean, 2),
      "\nSD: ", round(sd, 2)
    )
  )

# Plot
penguins |>
  ggplot(aes(x = bill_depth_mm)) +
  geom_histogram(
    bins = 30,
    fill = "steelblue",
    color = "white"
  ) +
  facet_wrap(~ island) +
  geom_text(
    data = bill_depth_stats,
    aes(
      x = Inf,
      y = Inf,
      label = label
    ),
    hjust = 1.1,
    vjust = 1.1,
    inherit.aes = FALSE
  ) +
  labs(
    title = "Bill Depth by Island",
    x = "Bill Depth (mm)",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

library(tidyverse)

# Create summary statistics
flipper_length_stats <-
  penguins |>
  group_by(species) |>
  summarize(
    mean = mean(flipper_length_mm, na.rm = TRUE),
    sd   = sd(flipper_length_mm, na.rm = TRUE),
    .groups = "drop"
  ) |>
  mutate(
    label = paste0(
      "Mean: ", round(mean, 2),
      "\nSD: ", round(sd, 2)
    )
  )

# Plot
penguins |>
  ggplot(aes(x = flipper_length_mm)) +
  geom_histogram(
    bins = 30,
    fill = "steelblue",
    color = "white"
  ) +
  facet_wrap(~ species) +
  geom_text(
    data = flipper_length_stats,
    aes(
      x = Inf,
      y = Inf,
      label = label
    ),
    hjust = 1.1,
    vjust = 1.1,
    inherit.aes = FALSE
  ) +
  labs(
    title = "Flipper Length by Species",
    x = "Flipper Length (mm)",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

library(tidyverse)

# Create summary statistics grouped by island
flipper_length_stats <-
  penguins |>
  group_by(island) |>
  summarize(
    mean = mean(flipper_length_mm, na.rm = TRUE),
    sd   = sd(flipper_length_mm, na.rm = TRUE),
    .groups = "drop"
  ) |>
  mutate(
    label = paste0(
      "Mean: ", round(mean, 2),
      "\nSD: ", round(sd, 2)
    )
  )

# Plot
penguins |>
  ggplot(aes(x = flipper_length_mm)) +
  geom_histogram(
    bins = 30,
    fill = "steelblue",
    color = "white"
  ) +
  facet_wrap(~ island) +
  geom_text(
    data = flipper_length_stats,
    aes(
      x = Inf,
      y = Inf,
      label = label
    ),
    hjust = 1.1,
    vjust = 1.1,
    inherit.aes = FALSE
  ) +
  labs(
    title = "Flipper Length by Island",
    x = "Flipper Length (mm)",
    y = "Count"
  ) +
  theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

library(tidyverse)

# Remove NA values
penguins_clean <-
  penguins |>
  drop_na(bill_length_mm, sex)

# Summary statistics by species and sex
bill_length_stats <-
  penguins_clean |>
  group_by(species, sex) |>
  summarize(
    mean = mean(bill_length_mm),
    sd   = sd(bill_length_mm),
    .groups = "drop"
  ) |>
  mutate(
    label = paste0(
      "Mean: ", round(mean, 2),
      "\nSD: ", round(sd, 2)
    )
  )

species_bill_length <-
  penguins_clean |>
  ggplot(aes(x = bill_length_mm, fill = sex)) +
  geom_histogram(
    bins = 20,          # <-- changed to 20 bins
    color = "white"
  ) +
  facet_wrap(~ species + sex, ncol = 2) +
  geom_text(
    data = bill_length_stats,
    aes(
      x = Inf,
      y = Inf,
      label = label
    ),
    hjust = 1.1,
    vjust = 1.1,
    inherit.aes = FALSE
  ) +
  labs(
    title = "Bill Length by Species and Sex",
    x = "Bill Length (mm)",
    y = "Count"
  ) +
  theme_minimal() +
  theme(legend.position = "none")

species_bill_length

Regression Example :

# Clear Linear Association : body_mass_g and flipper_length_mm
plot(penguins$body_mass_g, penguins$flipper_length_mm)

cor(penguins$bill_length_mm, penguins$bill_depth_mm, use = "complete.obs")
## [1] -0.2350529

Simpsons Paradox : Regression Example :

# Remove missing values
penguins_clean <- na.omit(penguins)

# Use factor levels (ensures real species names)
species_levels <- levels(penguins_clean$species)

species_colors <- c("blue", "darkgreen", "purple")

# Empty plot
plot(
  penguins_clean$bill_depth_mm,
  penguins_clean$bill_length_mm,
  type = "n",
  xlab = "Bill Depth (mm)",
  ylab = "Bill Length (mm)",
  main = "Simpson's Paradox"
)

# Add species points + lines
for (i in seq_along(species_levels)) {
  
  sp <- species_levels[i]
  
  species_data <- subset(
    penguins_clean,
    species == sp
  )
  
  points(
    species_data$bill_depth_mm,
    species_data$bill_length_mm,
    col = species_colors[i],
    pch = 16
  )
  
  species_fit <- lm(
    bill_length_mm ~ bill_depth_mm,
    data = species_data
  )
  
  abline(
    species_fit,
    col = species_colors[i],
    lwd = 2
  )
}

# Global regression line (red)
global_fit <- lm(
  bill_length_mm ~ bill_depth_mm,
  data = penguins_clean
)

abline(
  global_fit,
  col = "red",
  lwd = 3
)

# Legend with actual species names
legend(
  "topleft",
  legend = c(species_levels, "Global"),
  col = c(species_colors, "red"),
  lwd = c(rep(2, length(species_levels)), 3),
  pch = c(rep(16, length(species_levels)), NA),
  bty = "n"
)

Multivariable thinking & Model Comparison :

All models are wrong, but some are useful

George E. P. Box

mdl1 <- lm(bill_depth_mm ~ bill_length_mm, dat = penguins)
mdl2 <- lm(bill_depth_mm ~ bill_length_mm + species, dat = penguins)
mdl3 <- lm(bill_depth_mm ~ bill_length_mm + island, dat = penguins)
mdl4 <- lm(bill_depth_mm ~ bill_length_mm + sex, dat = penguins)
mdl5 <- lm(bill_depth_mm ~ bill_length_mm + species + island, dat = penguins)
mdl6 <- lm(bill_depth_mm ~ bill_length_mm + species + sex, dat = penguins)
mdl7 <- lm(bill_depth_mm ~ bill_length_mm + species + island, dat = penguins)
mdl8 <- lm(bill_depth_mm ~ bill_length_mm + species + island + sex, dat = penguins)

summary(mdl1)
## 
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm, data = penguins)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1381 -1.4263  0.0164  1.3841  4.5255 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    20.88547    0.84388  24.749  < 2e-16 ***
## bill_length_mm -0.08502    0.01907  -4.459 1.12e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.922 on 340 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.05525,    Adjusted R-squared:  0.05247 
## F-statistic: 19.88 on 1 and 340 DF,  p-value: 1.12e-05
plot(mdl1, which = c(1,2))

summary(mdl2)
## 
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + species, data = penguins)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4529 -0.6864 -0.0508  0.5519  3.5915 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.59218    0.68302  15.508  < 2e-16 ***
## bill_length_mm    0.19989    0.01749  11.427  < 2e-16 ***
## speciesChinstrap -1.93319    0.22416  -8.624 2.55e-16 ***
## speciesGentoo    -5.10602    0.19142 -26.674  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9533 on 338 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.769,  Adjusted R-squared:  0.7669 
## F-statistic: 375.1 on 3 and 338 DF,  p-value: < 2.2e-16
plot(mdl2,  which = c(1,2))

summary(mdl3)
## 
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + island, data = penguins)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2539 -1.2381 -0.0759  0.8819  5.0908 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     17.41079    0.75453  23.075   <2e-16 ***
## bill_length_mm  -0.03394    0.01647  -2.061   0.0401 *  
## islandDream      2.43252    0.18189  13.373   <2e-16 ***
## islandTorgersen  2.34053    0.26544   8.818   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.527 on 338 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.4074, Adjusted R-squared:  0.4022 
## F-statistic: 77.46 on 3 and 338 DF,  p-value: < 2.2e-16
plot(mdl3, which = c(1,2))

summary(mdl4)
## 
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + sex, data = penguins)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4055 -1.3440 -0.0007  1.1859  4.1253 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    22.56139    0.76355  29.548  < 2e-16 ***
## bill_length_mm -0.14576    0.01787  -8.155 7.35e-15 ***
## sexmale         2.01334    0.19519  10.315  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.672 on 330 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.2833, Adjusted R-squared:  0.279 
## F-statistic: 65.23 on 2 and 330 DF,  p-value: < 2.2e-16
plot(mdl4, which = c(1,2))

summary(mdl5)
## 
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + species + island, 
##     data = penguins)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4500 -0.6824 -0.0527  0.5737  3.5390 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.59154    0.69947  15.142  < 2e-16 ***
## bill_length_mm    0.19959    0.01756  11.365  < 2e-16 ***
## speciesChinstrap -1.89334    0.25035  -7.563  3.8e-13 ***
## speciesGentoo    -5.09080    0.22502 -22.624  < 2e-16 ***
## islandDream      -0.02422    0.19274  -0.126    0.900    
## islandTorgersen   0.06375    0.19667   0.324    0.746    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9558 on 336 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.7692, Adjusted R-squared:  0.7657 
## F-statistic: 223.9 on 5 and 336 DF,  p-value: < 2.2e-16
plot(mdl5, which = c(1,2))

summary(mdl6)
## 
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + species + sex, 
##     data = penguins)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10170 -0.54971 -0.07144  0.49233  2.92621 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      15.07161    0.72264  20.856  < 2e-16 ***
## bill_length_mm    0.06824    0.01942   3.514 0.000504 ***
## speciesChinstrap -0.60971    0.22855  -2.668 0.008015 ** 
## speciesGentoo    -3.96308    0.19686 -20.132  < 2e-16 ***
## sexmale           1.25285    0.11489  10.904  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8188 on 328 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:  0.8292, Adjusted R-squared:  0.8271 
## F-statistic: 398.1 on 4 and 328 DF,  p-value: < 2.2e-16
plot(mdl6, which = c(1,2))

summary(mdl7)
## 
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + species + island, 
##     data = penguins)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4500 -0.6824 -0.0527  0.5737  3.5390 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      10.59154    0.69947  15.142  < 2e-16 ***
## bill_length_mm    0.19959    0.01756  11.365  < 2e-16 ***
## speciesChinstrap -1.89334    0.25035  -7.563  3.8e-13 ***
## speciesGentoo    -5.09080    0.22502 -22.624  < 2e-16 ***
## islandDream      -0.02422    0.19274  -0.126    0.900    
## islandTorgersen   0.06375    0.19667   0.324    0.746    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9558 on 336 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.7692, Adjusted R-squared:  0.7657 
## F-statistic: 223.9 on 5 and 336 DF,  p-value: < 2.2e-16
plot(mdl7, which = c(1,2))

summary(mdl8)
## 
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + species + island + 
##     sex, data = penguins)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0037 -0.5569 -0.0817  0.4990  2.9183 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      15.14954    0.73540  20.600  < 2e-16 ***
## bill_length_mm    0.06647    0.01949   3.411  0.00073 ***
## speciesChinstrap -0.49350    0.25050  -1.970  0.04968 *  
## speciesGentoo    -3.96088    0.22056 -17.959  < 2e-16 ***
## islandDream      -0.11167    0.16600  -0.673  0.50160    
## islandTorgersen   0.08981    0.17189   0.522  0.60169    
## sexmale           1.26059    0.11515  10.947  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8194 on 326 degrees of freedom
##   (11 observations deleted due to missingness)
## Multiple R-squared:   0.83,  Adjusted R-squared:  0.8269 
## F-statistic: 265.3 on 6 and 326 DF,  p-value: < 2.2e-16
plot(mdl8, which = c(1,2))

colnames(penguins)
## [1] "species"           "island"            "bill_length_mm"   
## [4] "bill_depth_mm"     "flipper_length_mm" "body_mass_g"      
## [7] "sex"               "year"

Compare mdls

penguins_clean <- penguins |> 
  drop_na(bill_depth_mm, bill_length_mm, species, sex)
mdl2_new <- lm(bill_depth_mm ~ bill_length_mm + species, data = penguins_clean)
mdl6_new <- lm(bill_depth_mm ~ bill_length_mm + species + sex, data = penguins_clean)

anova(mdl2_new, mdl6_new)
## Analysis of Variance Table
## 
## Model 1: bill_depth_mm ~ bill_length_mm + species
## Model 2: bill_depth_mm ~ bill_length_mm + species + sex
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    329 299.62                                  
## 2    328 219.90  1    79.718 118.91 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Conditional Prob Example :

Dependent : Categorical :

library(dplyr)
unique(penguins$species)
## [1] Adelie    Gentoo    Chinstrap
## Levels: Adelie Chinstrap Gentoo
unique(penguins$island)
## [1] Torgersen Biscoe    Dream    
## Levels: Biscoe Dream Torgersen
tbl_species_island <- table(penguins$species, penguins$island)
addmargins(tbl_species_island)
##            
##             Biscoe Dream Torgersen Sum
##   Adelie        44    56        52 152
##   Chinstrap      0    68         0  68
##   Gentoo       124     0         0 124
##   Sum          168   124        52 344
chisq.test(tbl_species_island)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl_species_island
## X-squared = 299.55, df = 4, p-value < 2.2e-16

Independent : Categorical :

unique(penguins$sex)
## [1] male   female <NA>  
## Levels: female male
unique(penguins$species)
## [1] Adelie    Gentoo    Chinstrap
## Levels: Adelie Chinstrap Gentoo
tbl_sex_species <- table(penguins$sex, penguins$species)
addmargins(tbl_sex_species)
##         
##          Adelie Chinstrap Gentoo Sum
##   female     73        34     58 165
##   male       73        34     61 168
##   Sum       146        68    119 333
chisq.test(tbl_sex_species)
## 
##  Pearson's Chi-squared test
## 
## data:  tbl_sex_species
## X-squared = 0.048607, df = 2, p-value = 0.976

Conditional Prob Example :

library(ggplot2)
penguins |> ggplot(aes(bill_depth_mm)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

penguins |> ggplot(aes(bill_depth_mm)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).

numeric <-
  penguins |> 
  select(where(is.numeric)) |>
  select(-year)

plot(numeric)

Simulated Data :

set.seed(123)  

Perfect Regression :

Normal Dist Example

set.seed(123)  
x <- rnorm(n = 400, mean = 10, sd = 2)
y <- 2*x + 4
df <- data.frame(x=x,y=y)
head(df, 5)
##           x        y
## 1  8.879049 21.75810
## 2  9.539645 23.07929
## 3 13.117417 30.23483
## 4 10.141017 24.28203
## 5 10.258575 24.51715
plot(x,y)

# write.csv(df, "simulated_regression_example1.csv")

Signal + Noise Example :

rand_noise_normal <- rnorm(n = 400, mean = 0, sd = 1)
x <- rnorm(n = 400, mean = 10, sd = 2)
y <- 2*x + 4 + rand_noise_normal
hist(y)

plot(x,y)

Heteroskedastic Noise – larger variance as X inc.

set.seed(123)

# Generate x
x <- rnorm(n = 400, mean = 10, sd = 2)

# One-direction V-shaped noise (grows as x increases)
noise <- rnorm(400, mean = 0, sd = (x - min(x)))

# Generate y
y <- 2*x + 4 + noise

# Plot
plot(x, y)
abline(lm(y ~ x), col = "red")

Heteroskedastic Noise – larger variance as X dec.

set.seed(123)

# Generate x
x <- rnorm(n = 400, mean = 10, sd = 2)

# One-direction V-shaped noise (grows as x decreases)
noise <- rnorm(400, mean = 0, sd = (max(x) - x))

# Generate y
y <- 2*x + 4 + noise

# Plot
plot(x, y)
abline(lm(y ~ x), col = "red")

Right Skew Exponential Dist Regression Example :

x <- rexp(400, rate = 1)
hist(x)

y <- 2*x + 4
hist(y)

plot(x,y)

Left Skew Regression Example :

x <- -rexp(400, rate = 1)
hist(x)

y <- 2*x + 4
hist(y)

plot(x,y)