| Variable Name | Type | Description | Units |
|---|---|---|---|
species |
Categorical (Factor) | Penguin species classification | — |
island |
Categorical (Factor) | Island in the Palmer Archipelago where observation occurred | — |
bill_length_mm |
Numerical (Continuous) | Length of the penguin’s bill (beak) | millimeters |
bill_depth_mm |
Numerical (Continuous) | Depth (height) of the penguin’s bill | millimeters |
flipper_length_mm |
Numerical (Discrete) | Length of the penguin’s flipper | millimeters |
body_mass_g |
Numerical (Discrete) | Body mass of the penguin | grams |
sex |
Categorical (Factor) | Biological sex of the penguin | — |
year |
Numerical (Discrete) | Year the observation was recorded | year |
library(palmerpenguins)
?palmerpenguins
penguins
## # A tibble: 344 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## 7 Adelie Torgersen 38.9 17.8 181 3625
## 8 Adelie Torgersen 39.2 19.6 195 4675
## 9 Adelie Torgersen 34.1 18.1 193 3475
## 10 Adelie Torgersen 42 20.2 190 4250
## # ℹ 334 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
?penguins
boxplot(
penguins$bill_depth_mm,
horizontal = TRUE,
main = "Boxplot of Penguin Bill Depth"
)
par(mfrow = c(2, 1), mar = c(4, 4, 2, 1))
hist(
penguins$bill_depth_mm,
main = "Histogram of Penguin Bill Depth",
xlab = "Bill Depth (mm)"
)
boxplot(
penguins$bill_depth_mm,
horizontal = TRUE,
main = "",
xlab = "Bill Depth (mm)"
)
par(mfrow = c(2, 1), mar = c(4, 4, 2, 1))
hist(
penguins$bill_length_mm,
main = "Histogram of Penguin Bill Depth",
xlab = "Bill length (mm)"
)
boxplot(
penguins$bill_length_mm,
horizontal = TRUE,
main = "",
xlab = "Bill length (mm)"
)
library(palmerpenguins)
# Remove missing values
penguins_clean <- na.omit(penguins[, c("bill_length_mm", "species")])
# Define colors for each species
species_colors <- c(
"Adelie" = "tomato",
"Chinstrap" = "darkseagreen3",
"Gentoo" = "steelblue"
)
# Ensure consistent ordering
penguins_clean$species <- factor(
penguins_clean$species,
levels = c("Adelie", "Chinstrap", "Gentoo")
)
# Set layout: 2 rows (histograms and boxplots), 3 columns (species)
par(mfrow = c(2, 3), mar = c(4, 4, 3, 1))
# Histograms by species
for (sp in levels(penguins_clean$species)) {
hist(
penguins_clean$bill_length_mm[penguins_clean$species == sp],
col = species_colors[sp],
main = paste("Histogram -", sp),
xlab = "Bill length (mm)",
xlim = range(penguins_clean$bill_length_mm),
breaks = 15,
border = "white"
)
}
# Boxplots by species (one per panel)
for (sp in levels(penguins_clean$species)) {
boxplot(
penguins_clean$bill_length_mm[penguins_clean$species == sp],
horizontal = TRUE,
col = species_colors[sp],
main = paste("Boxplot -", sp),
xlab = "Bill length (mm)"
)
}
# set up plotting window: 1 row, 3 columns
par(mfrow = c(1, 3))
# Adelie
hist(
penguins$bill_length_mm[penguins$species == "Adelie"],
col = "skyblue",
main = "Adelie",
xlab = "Bill Length (mm)"
)
# Chinstrap
hist(
penguins$bill_length_mm[penguins$species == "Chinstrap"],
col = "orange",
main = "Chinstrap",
xlab = "Bill Length (mm)"
)
# Gentoo
hist(
penguins$bill_length_mm[penguins$species == "Gentoo"],
col = "purple",
main = "Gentoo",
xlab = "Bill Length (mm)"
)
# reset plotting layout
par(mfrow = c(1, 1))
# Load necessary package (optional if using base R only)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
penguins_clean <- na.omit(penguins)
# Set up plotting area: 1 row, 3 columns
par(mfrow = c(1, 3))
species_list <- c("Adelie", "Chinstrap", "Gentoo")
for (sp in species_list) {
species_data <- penguins_clean %>%
filter(species == sp)
male <- species_data %>%
filter(sex == "male") %>%
pull(bill_length_mm)
female <- species_data %>%
filter(sex == "female") %>%
pull(bill_length_mm)
# Plot male histogram
hist(
male,
breaks = 15,
col = rgb(0, 0, 1, 0.6),
xlim = range(c(male, female)),
main = sp,
xlab = "Bill Length (mm)"
)
# Add female histogram on top
hist(
female,
breaks = 15,
col = rgb(1, 0, 0, 0.6),
add = TRUE
)
legend(
"topright",
legend = c("Male", "Female"),
fill = c(rgb(0, 0, 1, 0.6), rgb(1, 0, 0, 0.6))
)
}
# Overall title
mtext(
"Bill Length Split by Species",
outer = TRUE,
line = -2,
cex = 1.2
)
# Reset plotting layout
par(mfrow = c(1, 1))
library(ggplot2)
library(dplyr)
library(tidyr)
penguins %>%
drop_na(bill_length_mm, species, sex) %>%
ggplot(aes(x = bill_length_mm, fill = sex)) +
geom_histogram(bins = 14, color = "white", alpha = 0.8) +
facet_grid(sex ~ species) +
labs(
title = "Distribution of Penguin Bill Length by Species and Gender",
x = "Bill length (mm)",
y = "Count",
fill = "Gender"
) +
theme_minimal()
# Set up a 2x2 plotting layout
par(mfrow = c(2, 2))
hist(
penguins$bill_length_mm,
breaks = 30,
main = "Histogram of Bill Length (mm)",
xlab = "Bill Length (mm)"
)
hist(
penguins$bill_depth_mm,
breaks = 30,
main = "Histogram of Bill Depth (mm)",
xlab = "Bill Depth (mm)"
)
hist(
penguins$flipper_length_mm,
breaks = 30,
main = "Histogram of Flipper Length (mm)",
xlab = "Flipper Length (mm)"
)
hist(
penguins$body_mass_g,
breaks = 30,
main = "Histogram of Body Mass (g)",
xlab = "Body Mass (g)"
)
# Reset layout
par(mfrow = c(1, 1))
unique(penguins$island)
## [1] Torgersen Biscoe Dream
## Levels: Biscoe Dream Torgersen
unique(penguins$species)
## [1] Adelie Gentoo Chinstrap
## Levels: Adelie Chinstrap Gentoo
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.3 ✔ stringr 1.5.1
## ✔ purrr 1.1.0 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
penguins |>
select(
species,
bill_length_mm,
bill_depth_mm,
flipper_length_mm,
body_mass_g
) |>
pivot_longer(
cols = -species,
names_to = "variable",
values_to = "value"
) |>
ggplot(aes(x = value, fill = species)) +
geom_histogram(
bins = 30,
position = "identity",
alpha = 0.6
) +
facet_wrap(~ variable, scales = "free") +
labs(
title = "Histograms of Penguin Measurements by Species",
x = "Value",
y = "Count",
fill = "Species"
) +
theme_minimal()
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_bin()`).
library(tidyverse)
penguins |>
select(
island,
bill_length_mm,
bill_depth_mm,
flipper_length_mm,
body_mass_g
) |>
pivot_longer(
cols = -island,
names_to = "variable",
values_to = "value"
) |>
ggplot(aes(x = value, fill = island)) +
geom_histogram(
bins = 30,
position = "identity",
alpha = 0.6
) +
facet_wrap(~ variable, scales = "free") +
labs(
title = "Histograms of Penguin Measurements by Island",
x = "Value",
y = "Count",
fill = "Island"
) +
theme_minimal()
## Warning: Removed 8 rows containing non-finite outside the scale range
## (`stat_bin()`).
library(tidyverse)
# Create summary table
bill_length_stats <-
penguins |>
group_by(species) |>
summarize(
mean = mean(bill_length_mm, na.rm = TRUE),
sd = sd(bill_length_mm, na.rm = TRUE),
.groups = "drop"
) |>
mutate(
label = paste0(
"Mean: ", round(mean, 2),
"\nSD: ", round(sd, 2)
)
)
species_bill_length <-
penguins |>
ggplot(aes(x = bill_length_mm)) +
geom_histogram(
bins = 30,
fill = "steelblue",
color = "white"
) +
facet_wrap(~ species) +
geom_text(
data = bill_length_stats,
aes(
x = Inf,
y = Inf,
label = label
),
hjust = 1.1,
vjust = 1.1,
inherit.aes = FALSE
) +
labs(
title = "Bill Length by Species",
x = "Bill Length (mm)",
y = "Count"
) +
theme_minimal()
species_bill_length
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
library(tidyverse)
# Remove NA values
penguins_clean <-
penguins |>
drop_na(bill_length_mm, sex)
# Summary table (overall + by sex)
bill_length_stats <-
penguins_clean |>
group_by(species, sex) |>
summarize(
mean = mean(bill_length_mm),
sd = sd(bill_length_mm),
.groups = "drop"
) |>
mutate(
label = paste0(
"Mean: ", round(mean, 2),
"\nSD: ", round(sd, 2)
)
)
species_bill_length <-
penguins_clean |>
ggplot(aes(x = bill_length_mm, fill = sex)) +
geom_histogram(
bins = 20,
color = "white"
) +
facet_wrap(~ species + sex, ncol = 2) +
geom_text(
data = bill_length_stats,
aes(
x = Inf,
y = Inf,
label = label
),
hjust = 1.1,
vjust = 1.1,
inherit.aes = FALSE
) +
labs(
title = "Bill Length by Species and Sex",
x = "Bill Length (mm)",
y = "Count"
) +
theme_minimal() +
theme(legend.position = "none")
species_bill_length
library(tidyverse)
# Create summary table grouped by island
bill_length_stats <-
penguins |>
group_by(island) |>
summarize(
mean = mean(bill_length_mm, na.rm = TRUE),
sd = sd(bill_length_mm, na.rm = TRUE),
.groups = "drop"
) |>
mutate(
label = paste0(
"Mean: ", round(mean, 2),
"\nSD: ", round(sd, 2)
)
)
# Plot
island_bill_length <-
penguins |>
ggplot(aes(x = bill_length_mm)) +
geom_histogram(
bins = 30,
fill = "steelblue",
color = "white"
) +
facet_wrap(~ island) +
geom_text(
data = bill_length_stats,
aes(
x = Inf,
y = Inf,
label = label
),
hjust = 1.1,
vjust = 1.1,
inherit.aes = FALSE
) +
labs(
title = "Bill Length by Island",
x = "Bill Length (mm)",
y = "Count"
) +
theme_minimal()
island_bill_length
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
library(tidyverse)
species_bill_length_stats <-
penguins |>
group_by(species) |>
summarize(
mean = mean(bill_length_mm, na.rm = TRUE),
sd = sd(bill_length_mm, na.rm = TRUE),
max_count = max(hist(bill_length_mm, plot = FALSE)$counts),
.groups = "drop"
) |>
mutate(
label = paste0(
"Mean: ", round(mean, 2),
"\nSD: ", round(sd, 2)
)
)
species_bill_length_plot <-
penguins |>
ggplot(aes(x = bill_length_mm)) +
geom_histogram(
bins = 30,
fill = "steelblue",
color = "white"
) +
facet_wrap(~ species) +
geom_label(
data = species_bill_length_stats,
aes(
x = Inf,
y = max_count,
label = label
),
hjust = 1.1,
vjust = 1.1,
inherit.aes = FALSE,
fill = "white",
label.size = 0.3
) +
labs(
title = "Bill Length by Species",
x = "Bill Length (mm)",
y = "Count"
) +
theme_minimal()
## Warning: The `label.size` argument of `geom_label()` is deprecated as of ggplot2 3.5.0.
## ℹ Please use the `linewidth` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
island_bill_length_stats <-
penguins |>
group_by(island) |>
summarize(
mean = mean(bill_length_mm, na.rm = TRUE),
sd = sd(bill_length_mm, na.rm = TRUE),
max_count = max(hist(bill_length_mm, plot = FALSE)$counts),
.groups = "drop"
) |>
mutate(
label = paste0(
"Mean: ", round(mean, 2),
"\nSD: ", round(sd, 2)
)
)
island_bill_length_plot <-
penguins |>
ggplot(aes(x = bill_length_mm)) +
geom_histogram(
bins = 30,
fill = "steelblue",
color = "white"
) +
facet_wrap(~ island) +
geom_label(
data = island_bill_length_stats,
aes(
x = Inf,
y = max_count,
label = label
),
hjust = 1.1,
vjust = 1.1,
inherit.aes = FALSE,
fill = "white",
label.size = 0.3
) +
labs(
title = "Bill Length by Island",
x = "Bill Length (mm)",
y = "Count"
) +
theme_minimal()
library(patchwork)
species_bill_length_plot /
island_bill_length_plot
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
library(tidyverse)
# Create summary statistics
bill_depth_stats <-
penguins |>
group_by(species) |>
summarize(
mean = mean(bill_depth_mm, na.rm = TRUE),
sd = sd(bill_depth_mm, na.rm = TRUE),
.groups = "drop"
) |>
mutate(
label = paste0(
"Mean: ", round(mean, 2),
"\nSD: ", round(sd, 2)
)
)
# Plot
penguins |>
ggplot(aes(x = bill_depth_mm)) +
geom_histogram(
bins = 30,
fill = "steelblue",
color = "white"
) +
facet_wrap(~ species) +
geom_text(
data = bill_depth_stats,
aes(
x = Inf,
y = Inf,
label = label
),
hjust = 1.1,
vjust = 1.1,
inherit.aes = FALSE
) +
labs(
title = "Bill Depth by Species",
x = "Bill Depth (mm)",
y = "Count"
) +
theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
library(tidyverse)
# Remove NA values
penguins_clean <-
penguins |>
drop_na(bill_depth_mm, sex)
# Summary statistics by species and sex
bill_depth_stats <-
penguins_clean |>
group_by(species, sex) |>
summarize(
mean = mean(bill_depth_mm),
sd = sd(bill_depth_mm),
.groups = "drop"
) |>
mutate(
label = paste0(
"Mean: ", round(mean, 2),
"\nSD: ", round(sd, 2)
)
)
species_bill_depth <-
penguins_clean |>
ggplot(aes(x = bill_depth_mm, fill = sex)) +
geom_histogram(
bins = 13,
color = "white"
) +
facet_wrap(~ species + sex, ncol = 2) +
geom_text(
data = bill_depth_stats,
aes(
x = Inf,
y = Inf,
label = label
),
hjust = 1.1,
vjust = 1.1,
inherit.aes = FALSE
) +
labs(
title = "Bill Depth by Species and Sex",
x = "Bill Depth (mm)",
y = "Count"
) +
theme_minimal() +
theme(legend.position = "none")
species_bill_depth
library(tidyverse)
# Create summary statistics grouped by island
bill_depth_stats <-
penguins |>
group_by(island) |>
summarize(
mean = mean(bill_depth_mm, na.rm = TRUE),
sd = sd(bill_depth_mm, na.rm = TRUE),
.groups = "drop"
) |>
mutate(
label = paste0(
"Mean: ", round(mean, 2),
"\nSD: ", round(sd, 2)
)
)
# Plot
penguins |>
ggplot(aes(x = bill_depth_mm)) +
geom_histogram(
bins = 30,
fill = "steelblue",
color = "white"
) +
facet_wrap(~ island) +
geom_text(
data = bill_depth_stats,
aes(
x = Inf,
y = Inf,
label = label
),
hjust = 1.1,
vjust = 1.1,
inherit.aes = FALSE
) +
labs(
title = "Bill Depth by Island",
x = "Bill Depth (mm)",
y = "Count"
) +
theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
library(tidyverse)
# Create summary statistics
flipper_length_stats <-
penguins |>
group_by(species) |>
summarize(
mean = mean(flipper_length_mm, na.rm = TRUE),
sd = sd(flipper_length_mm, na.rm = TRUE),
.groups = "drop"
) |>
mutate(
label = paste0(
"Mean: ", round(mean, 2),
"\nSD: ", round(sd, 2)
)
)
# Plot
penguins |>
ggplot(aes(x = flipper_length_mm)) +
geom_histogram(
bins = 30,
fill = "steelblue",
color = "white"
) +
facet_wrap(~ species) +
geom_text(
data = flipper_length_stats,
aes(
x = Inf,
y = Inf,
label = label
),
hjust = 1.1,
vjust = 1.1,
inherit.aes = FALSE
) +
labs(
title = "Flipper Length by Species",
x = "Flipper Length (mm)",
y = "Count"
) +
theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
library(tidyverse)
# Create summary statistics grouped by island
flipper_length_stats <-
penguins |>
group_by(island) |>
summarize(
mean = mean(flipper_length_mm, na.rm = TRUE),
sd = sd(flipper_length_mm, na.rm = TRUE),
.groups = "drop"
) |>
mutate(
label = paste0(
"Mean: ", round(mean, 2),
"\nSD: ", round(sd, 2)
)
)
# Plot
penguins |>
ggplot(aes(x = flipper_length_mm)) +
geom_histogram(
bins = 30,
fill = "steelblue",
color = "white"
) +
facet_wrap(~ island) +
geom_text(
data = flipper_length_stats,
aes(
x = Inf,
y = Inf,
label = label
),
hjust = 1.1,
vjust = 1.1,
inherit.aes = FALSE
) +
labs(
title = "Flipper Length by Island",
x = "Flipper Length (mm)",
y = "Count"
) +
theme_minimal()
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
library(tidyverse)
# Remove NA values
penguins_clean <-
penguins |>
drop_na(bill_length_mm, sex)
# Summary statistics by species and sex
bill_length_stats <-
penguins_clean |>
group_by(species, sex) |>
summarize(
mean = mean(bill_length_mm),
sd = sd(bill_length_mm),
.groups = "drop"
) |>
mutate(
label = paste0(
"Mean: ", round(mean, 2),
"\nSD: ", round(sd, 2)
)
)
species_bill_length <-
penguins_clean |>
ggplot(aes(x = bill_length_mm, fill = sex)) +
geom_histogram(
bins = 20, # <-- changed to 20 bins
color = "white"
) +
facet_wrap(~ species + sex, ncol = 2) +
geom_text(
data = bill_length_stats,
aes(
x = Inf,
y = Inf,
label = label
),
hjust = 1.1,
vjust = 1.1,
inherit.aes = FALSE
) +
labs(
title = "Bill Length by Species and Sex",
x = "Bill Length (mm)",
y = "Count"
) +
theme_minimal() +
theme(legend.position = "none")
species_bill_length
# Clear Linear Association : body_mass_g and flipper_length_mm
plot(penguins$body_mass_g, penguins$flipper_length_mm)
cor(penguins$bill_length_mm, penguins$bill_depth_mm, use = "complete.obs")
## [1] -0.2350529
# Remove missing values
penguins_clean <- na.omit(penguins)
# Use factor levels (ensures real species names)
species_levels <- levels(penguins_clean$species)
species_colors <- c("blue", "darkgreen", "purple")
# Empty plot
plot(
penguins_clean$bill_depth_mm,
penguins_clean$bill_length_mm,
type = "n",
xlab = "Bill Depth (mm)",
ylab = "Bill Length (mm)",
main = "Simpson's Paradox"
)
# Add species points + lines
for (i in seq_along(species_levels)) {
sp <- species_levels[i]
species_data <- subset(
penguins_clean,
species == sp
)
points(
species_data$bill_depth_mm,
species_data$bill_length_mm,
col = species_colors[i],
pch = 16
)
species_fit <- lm(
bill_length_mm ~ bill_depth_mm,
data = species_data
)
abline(
species_fit,
col = species_colors[i],
lwd = 2
)
}
# Global regression line (red)
global_fit <- lm(
bill_length_mm ~ bill_depth_mm,
data = penguins_clean
)
abline(
global_fit,
col = "red",
lwd = 3
)
# Legend with actual species names
legend(
"topleft",
legend = c(species_levels, "Global"),
col = c(species_colors, "red"),
lwd = c(rep(2, length(species_levels)), 3),
pch = c(rep(16, length(species_levels)), NA),
bty = "n"
)
“All models are wrong, but some are useful”
mdl1 <- lm(bill_depth_mm ~ bill_length_mm, dat = penguins)
mdl2 <- lm(bill_depth_mm ~ bill_length_mm + species, dat = penguins)
mdl3 <- lm(bill_depth_mm ~ bill_length_mm + island, dat = penguins)
mdl4 <- lm(bill_depth_mm ~ bill_length_mm + sex, dat = penguins)
mdl5 <- lm(bill_depth_mm ~ bill_length_mm + species + island, dat = penguins)
mdl6 <- lm(bill_depth_mm ~ bill_length_mm + species + sex, dat = penguins)
mdl7 <- lm(bill_depth_mm ~ bill_length_mm + species + island, dat = penguins)
mdl8 <- lm(bill_depth_mm ~ bill_length_mm + species + island + sex, dat = penguins)
summary(mdl1)
##
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm, data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.1381 -1.4263 0.0164 1.3841 4.5255
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.88547 0.84388 24.749 < 2e-16 ***
## bill_length_mm -0.08502 0.01907 -4.459 1.12e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.922 on 340 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.05525, Adjusted R-squared: 0.05247
## F-statistic: 19.88 on 1 and 340 DF, p-value: 1.12e-05
plot(mdl1, which = c(1,2))
summary(mdl2)
##
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + species, data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4529 -0.6864 -0.0508 0.5519 3.5915
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.59218 0.68302 15.508 < 2e-16 ***
## bill_length_mm 0.19989 0.01749 11.427 < 2e-16 ***
## speciesChinstrap -1.93319 0.22416 -8.624 2.55e-16 ***
## speciesGentoo -5.10602 0.19142 -26.674 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9533 on 338 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.769, Adjusted R-squared: 0.7669
## F-statistic: 375.1 on 3 and 338 DF, p-value: < 2.2e-16
plot(mdl2, which = c(1,2))
summary(mdl3)
##
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + island, data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2539 -1.2381 -0.0759 0.8819 5.0908
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 17.41079 0.75453 23.075 <2e-16 ***
## bill_length_mm -0.03394 0.01647 -2.061 0.0401 *
## islandDream 2.43252 0.18189 13.373 <2e-16 ***
## islandTorgersen 2.34053 0.26544 8.818 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.527 on 338 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.4074, Adjusted R-squared: 0.4022
## F-statistic: 77.46 on 3 and 338 DF, p-value: < 2.2e-16
plot(mdl3, which = c(1,2))
summary(mdl4)
##
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + sex, data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4055 -1.3440 -0.0007 1.1859 4.1253
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.56139 0.76355 29.548 < 2e-16 ***
## bill_length_mm -0.14576 0.01787 -8.155 7.35e-15 ***
## sexmale 2.01334 0.19519 10.315 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.672 on 330 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.2833, Adjusted R-squared: 0.279
## F-statistic: 65.23 on 2 and 330 DF, p-value: < 2.2e-16
plot(mdl4, which = c(1,2))
summary(mdl5)
##
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + species + island,
## data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4500 -0.6824 -0.0527 0.5737 3.5390
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.59154 0.69947 15.142 < 2e-16 ***
## bill_length_mm 0.19959 0.01756 11.365 < 2e-16 ***
## speciesChinstrap -1.89334 0.25035 -7.563 3.8e-13 ***
## speciesGentoo -5.09080 0.22502 -22.624 < 2e-16 ***
## islandDream -0.02422 0.19274 -0.126 0.900
## islandTorgersen 0.06375 0.19667 0.324 0.746
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9558 on 336 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.7692, Adjusted R-squared: 0.7657
## F-statistic: 223.9 on 5 and 336 DF, p-value: < 2.2e-16
plot(mdl5, which = c(1,2))
summary(mdl6)
##
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + species + sex,
## data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.10170 -0.54971 -0.07144 0.49233 2.92621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.07161 0.72264 20.856 < 2e-16 ***
## bill_length_mm 0.06824 0.01942 3.514 0.000504 ***
## speciesChinstrap -0.60971 0.22855 -2.668 0.008015 **
## speciesGentoo -3.96308 0.19686 -20.132 < 2e-16 ***
## sexmale 1.25285 0.11489 10.904 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8188 on 328 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.8292, Adjusted R-squared: 0.8271
## F-statistic: 398.1 on 4 and 328 DF, p-value: < 2.2e-16
plot(mdl6, which = c(1,2))
summary(mdl7)
##
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + species + island,
## data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.4500 -0.6824 -0.0527 0.5737 3.5390
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.59154 0.69947 15.142 < 2e-16 ***
## bill_length_mm 0.19959 0.01756 11.365 < 2e-16 ***
## speciesChinstrap -1.89334 0.25035 -7.563 3.8e-13 ***
## speciesGentoo -5.09080 0.22502 -22.624 < 2e-16 ***
## islandDream -0.02422 0.19274 -0.126 0.900
## islandTorgersen 0.06375 0.19667 0.324 0.746
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9558 on 336 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.7692, Adjusted R-squared: 0.7657
## F-statistic: 223.9 on 5 and 336 DF, p-value: < 2.2e-16
plot(mdl7, which = c(1,2))
summary(mdl8)
##
## Call:
## lm(formula = bill_depth_mm ~ bill_length_mm + species + island +
## sex, data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0037 -0.5569 -0.0817 0.4990 2.9183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 15.14954 0.73540 20.600 < 2e-16 ***
## bill_length_mm 0.06647 0.01949 3.411 0.00073 ***
## speciesChinstrap -0.49350 0.25050 -1.970 0.04968 *
## speciesGentoo -3.96088 0.22056 -17.959 < 2e-16 ***
## islandDream -0.11167 0.16600 -0.673 0.50160
## islandTorgersen 0.08981 0.17189 0.522 0.60169
## sexmale 1.26059 0.11515 10.947 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8194 on 326 degrees of freedom
## (11 observations deleted due to missingness)
## Multiple R-squared: 0.83, Adjusted R-squared: 0.8269
## F-statistic: 265.3 on 6 and 326 DF, p-value: < 2.2e-16
plot(mdl8, which = c(1,2))
colnames(penguins)
## [1] "species" "island" "bill_length_mm"
## [4] "bill_depth_mm" "flipper_length_mm" "body_mass_g"
## [7] "sex" "year"
penguins_clean <- penguins |>
drop_na(bill_depth_mm, bill_length_mm, species, sex)
mdl2_new <- lm(bill_depth_mm ~ bill_length_mm + species, data = penguins_clean)
mdl6_new <- lm(bill_depth_mm ~ bill_length_mm + species + sex, data = penguins_clean)
anova(mdl2_new, mdl6_new)
## Analysis of Variance Table
##
## Model 1: bill_depth_mm ~ bill_length_mm + species
## Model 2: bill_depth_mm ~ bill_length_mm + species + sex
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 329 299.62
## 2 328 219.90 1 79.718 118.91 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(dplyr)
unique(penguins$species)
## [1] Adelie Gentoo Chinstrap
## Levels: Adelie Chinstrap Gentoo
unique(penguins$island)
## [1] Torgersen Biscoe Dream
## Levels: Biscoe Dream Torgersen
tbl_species_island <- table(penguins$species, penguins$island)
addmargins(tbl_species_island)
##
## Biscoe Dream Torgersen Sum
## Adelie 44 56 52 152
## Chinstrap 0 68 0 68
## Gentoo 124 0 0 124
## Sum 168 124 52 344
chisq.test(tbl_species_island)
##
## Pearson's Chi-squared test
##
## data: tbl_species_island
## X-squared = 299.55, df = 4, p-value < 2.2e-16
unique(penguins$sex)
## [1] male female <NA>
## Levels: female male
unique(penguins$species)
## [1] Adelie Gentoo Chinstrap
## Levels: Adelie Chinstrap Gentoo
tbl_sex_species <- table(penguins$sex, penguins$species)
addmargins(tbl_sex_species)
##
## Adelie Chinstrap Gentoo Sum
## female 73 34 58 165
## male 73 34 61 168
## Sum 146 68 119 333
chisq.test(tbl_sex_species)
##
## Pearson's Chi-squared test
##
## data: tbl_sex_species
## X-squared = 0.048607, df = 2, p-value = 0.976
library(ggplot2)
penguins |> ggplot(aes(bill_depth_mm)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
penguins |> ggplot(aes(bill_depth_mm)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_bin()`).
numeric <-
penguins |>
select(where(is.numeric)) |>
select(-year)
plot(numeric)
set.seed(123)
set.seed(123)
x <- rnorm(n = 400, mean = 10, sd = 2)
y <- 2*x + 4
df <- data.frame(x=x,y=y)
head(df, 5)
## x y
## 1 8.879049 21.75810
## 2 9.539645 23.07929
## 3 13.117417 30.23483
## 4 10.141017 24.28203
## 5 10.258575 24.51715
plot(x,y)
# write.csv(df, "simulated_regression_example1.csv")
rand_noise_normal <- rnorm(n = 400, mean = 0, sd = 1)
x <- rnorm(n = 400, mean = 10, sd = 2)
y <- 2*x + 4 + rand_noise_normal
hist(y)
plot(x,y)
set.seed(123)
# Generate x
x <- rnorm(n = 400, mean = 10, sd = 2)
# One-direction V-shaped noise (grows as x increases)
noise <- rnorm(400, mean = 0, sd = (x - min(x)))
# Generate y
y <- 2*x + 4 + noise
# Plot
plot(x, y)
abline(lm(y ~ x), col = "red")
set.seed(123)
# Generate x
x <- rnorm(n = 400, mean = 10, sd = 2)
# One-direction V-shaped noise (grows as x decreases)
noise <- rnorm(400, mean = 0, sd = (max(x) - x))
# Generate y
y <- 2*x + 4 + noise
# Plot
plot(x, y)
abline(lm(y ~ x), col = "red")
x <- rexp(400, rate = 1)
hist(x)
y <- 2*x + 4
hist(y)
plot(x,y)
x <- -rexp(400, rate = 1)
hist(x)
y <- 2*x + 4
hist(y)
plot(x,y)