Variation
Visualising distributions
# Attach ggplot2 before the first plot of this section: the ggplot() calls
# that follow need it, and the file otherwise attaches it only further down.
library(ggplot2)

# Create your data: five records with age, height and weight.
# The first record has neither height nor weight, so both columns start
# with a missing value (NA_real_ keeps the columns of type double).
data <- tibble::tibble(
  age = c(25, 25, 25, 33, 15),
  height = c(NA_real_, 196, 190, 173, 149),
  weight = c(NA_real_, 86, 90, 73, 45)
)
# Bar chart with one bar per distinct age. Age is wrapped in factor() so it
# is treated as a category rather than as a continuous axis.
ggplot(data = data, mapping = aes(x = factor(age))) +
  geom_bar() +
  labs(
    title = "Count of Records by Age",
    x = "Age",
    y = "Count"
  )

# Histogram of height using bins 5 units wide. na.rm = TRUE drops the record
# with a missing height silently instead of with a warning.
ggplot(data = data, mapping = aes(x = height)) +
  geom_histogram(na.rm = TRUE, binwidth = 5) +
  labs(
    title = "Distribution of Heights",
    x = "Height",
    y = "Count"
  )

# Filter out rows where weight is missing.
# filter() is called through the dplyr namespace because dplyr is only
# attached later in this file; without it `%>%` is undefined and a bare
# filter() would resolve to stats::filter().
filtered_data <- dplyr::filter(data, !is.na(weight))

# Frequency polygon of weight (bins 5 units wide), one coloured line per age.
ggplot(filtered_data, aes(x = weight, colour = factor(age))) +
  geom_freqpoly(binwidth = 5) +
  labs(
    title = "Weight Distribution by Age",
    x = "Weight",
    y = "Frequency",
    colour = "Age"
  )

Typical values
# Your dataset: five records; the first one has no height or weight, hence
# the leading missing value in both of those (double) columns.
data <- tibble::tibble(
  age = c(25, 25, 25, 33, 15),
  height = c(NA_real_, 196, 190, 173, 149),
  weight = c(NA_real_, 86, 90, 73, 45)
)
# Histogram of height (like the diamonds carat example), using narrow
# 1-unit bins; the missing height is dropped silently via na.rm = TRUE.
ggplot(data, aes(x = height)) +
  geom_histogram(na.rm = TRUE, binwidth = 1) +
  labs(
    title = "Distribution of Heights",
    x = "Height (cm)",
    y = "Count"
  )

# Histogram of weight (like the faithful eruptions example), using 5-unit
# bins; the missing weight is dropped silently via na.rm = TRUE.
ggplot(data, aes(x = weight)) +
  geom_histogram(na.rm = TRUE, binwidth = 5) +
  labs(
    title = "Distribution of Weights",
    x = "Weight (kg)",
    y = "Count"
  )

Unusual values
library(ggplot2)
# Define your data as a base data frame: five records, the first of which
# has no height or weight (stored as missing doubles).
df <- data.frame(
  age    = c(25, 25, 25, 33, 15),
  height = c(NA_real_, 196, 190, 173, 149),
  weight = c(NA_real_, 86, 90, 73, 45)
)
# Plot histogram of height in 5-unit bins, then zoom the y axis to 0-2 with
# coord_cartesian() so that low counts are easy to see. Zooming this way
# changes only the visible range; the binned data are left as they are.
ggplot(df) +
  geom_histogram(aes(x = height), na.rm = TRUE, binwidth = 5) +
  coord_cartesian(ylim = c(0, 2)) +
  labs(
    title = "Histogram of Height",
    x = "Height (cm)",
    y = "Count"
  )

Missing values
# Replace y values below 3 or above 20 with NA instead of dropping the rows.
# mutate() is called through the dplyr namespace because dplyr is only
# attached later in this file, so neither `%>%` nor a bare mutate() is
# available at this point.
diamonds2 <- dplyr::mutate(diamonds, y = ifelse(y < 3 | y > 20, NA, y))

# Plot x against y. geom_point() is left at its default na.rm = FALSE, so it
# warns about the rows it removes because y is missing.
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
  geom_point()
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).

#> Warning: Removed 9 rows containing missing values or values outside the scale range
#> (`geom_point()`).
Covariation
Categorical variable and continuous variable
# Box plot of weight for each age; age is converted to a factor so that each
# distinct age gets its own box. Missing weights are dropped silently.
ggplot(df, aes(x = factor(age), y = weight)) +
  geom_boxplot(na.rm = TRUE) +
  labs(
    title = "Weight by Age Group",
    x = "Age",
    y = "Weight (kg)"
  )

Two categorical variables
library(dplyr)
# Drop incomplete records, count each (age, weight) combination, and draw the
# counts as a tile grid whose fill colour encodes the count `n`.
df %>%
  filter(!is.na(age) & !is.na(weight)) %>%
  count(age, weight) %>%
  ggplot(mapping = aes(x = factor(age), y = factor(weight), fill = n)) +
  geom_tile(color = "white") +
  labs(
    title = "Count of Age and Weight Combinations",
    x = "Age",
    y = "Weight (kg)"
  ) +
  scale_fill_gradient(low = "lightblue", high = "darkblue")

Two continuous variables
library(hexbin)
# Hexagonal-bin plot of weight against height, using only the records where
# both measurements are present.
df %>%
  filter(!is.na(height) & !is.na(weight)) %>%
  ggplot(mapping = aes(x = height, y = weight)) +
  geom_hex() +
  labs(title = "Hexbin Plot of Height vs Weight")

Patterns and models
library(modelr)
# Keep only the records that have both height and weight.
df_clean <- df %>%
  filter(!is.na(height) & !is.na(weight))

# Fit a linear model of log(weight) on log(height).
mod <- lm(log(weight) ~ log(height), data = df_clean)

# Add the model residuals as column `resid`, then exponentiate them to move
# from the log scale of the fit back to the scale of weight.
df2 <- df_clean %>%
  add_residuals(mod) %>%
  mutate(resid = exp(resid))