Introduction

Questions

Variation

Visualising distributions

# Attach the packages this section relies on before first use: ggplot2
# supplies ggplot()/geom_*()/labs(), and dplyr supplies %>% and filter().
# Without dplyr attached, `filter` would resolve to stats::filter() and
# `%>%` would not be found.
library(ggplot2)
library(dplyr)

# Create your data: five records; the first has no height or weight.
# c() with numeric values already yields a double vector, so no
# as.numeric() coercion is needed around the NA.
data <- tibble::tibble(
  age = c(25, 25, 25, 33, 15),
  height = c(NA, 196, 190, 173, 149),
  weight = c(NA, 86, 90, 73, 45)
)

# Bar chart of age counts (age treated as a category via factor())
ggplot(data, aes(x = factor(age))) +
  geom_bar() +
  labs(title = "Count of Records by Age", x = "Age", y = "Count")

# Histogram of height; na.rm = TRUE drops the missing height silently
# instead of emitting a "removed rows" warning.
ggplot(data, aes(x = height)) +
  geom_histogram(binwidth = 5, na.rm = TRUE) +
  labs(title = "Distribution of Heights", x = "Height", y = "Count")

# Filter out rows where weight is missing
filtered_data <- data %>% filter(!is.na(weight))

# Frequency polygon of weight, one coloured line per age value
ggplot(filtered_data, aes(x = weight, colour = factor(age))) +
  geom_freqpoly(binwidth = 5) +
  labs(
    title = "Weight Distribution by Age",
    x = "Weight",
    y = "Frequency",
    colour = "Age"
  )

Typical values

# ggplot2 must be attached before ggplot() is called; the file only
# attaches it further down, so load it here to keep this section runnable.
library(ggplot2)

# Your dataset: five records; the first has no height or weight.
# The numeric literals already make these double vectors, so the NA needs
# no explicit as.numeric() coercion.
data <- tibble::tibble(
  age = c(25, 25, 25, 33, 15),
  height = c(NA, 196, 190, 173, 149),
  weight = c(NA, 86, 90, 73, 45)
)

# Histogram of height (like the diamonds carat example); a narrow
# binwidth of 1 gives each distinct height its own bar.
ggplot(data = data, mapping = aes(x = height)) +
  geom_histogram(binwidth = 1, na.rm = TRUE) +
  labs(title = "Distribution of Heights", x = "Height (cm)", y = "Count")

# Histogram of weight (like the faithful eruptions example)
ggplot(data = data, mapping = aes(x = weight)) +
  geom_histogram(binwidth = 5, na.rm = TRUE) +
  labs(title = "Distribution of Weights", x = "Weight (kg)", y = "Count")

Unusual values

library(ggplot2)

# Example records as a base data frame; the first row has neither a
# height nor a weight.
df <- data.frame(
  age    = c(25, 25, 25, 33, 15),
  height = c(NA, 196, 190, 173, 149),
  weight = c(NA, 86, 90, 73, 45)
)

# Height histogram with the y axis limited to 0-2 through
# coord_cartesian(), so low-count bars are easy to see.
ggplot(df, aes(x = height)) +
  geom_histogram(binwidth = 5, na.rm = TRUE) +
  coord_cartesian(ylim = c(0, 2)) +
  labs(
    title = "Histogram of Height",
    x = "Height (cm)",
    y = "Count"
  )

Missing values

# dplyr provides %>% and mutate(); the file does not attach it until a
# later section, so load it here to keep this section runnable in order.
library(dplyr)

# Replace y values below 3 or above 20 with NA rather than dropping the
# whole row, so the other columns of those records are kept.
diamonds2 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))

# Plot x against y; points whose y is now NA cannot be drawn, so ggplot2
# reports them as removed rows.
ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
  geom_point()
#> Warning: Removed 9 rows containing missing values or values outside the scale range
#> (`geom_point()`).

Covariation

Categorical variable and continuous variable

# Box plot of weight for each distinct age (age converted to a factor so
# it is treated as a category); missing weights are dropped silently.
ggplot(df, aes(x = factor(age), y = weight)) +
  geom_boxplot(na.rm = TRUE) +
  labs(
    title = "Weight by Age Group",
    x = "Age",
    y = "Weight (kg)"
  )

Two categorical variables

library(dplyr)

# Tally each (age, weight) pair among complete records, then draw one
# tile per pair shaded by how often it occurs.
df %>%
  filter(!is.na(age) & !is.na(weight)) %>%
  count(age, weight) %>%
  ggplot(aes(x = factor(age), y = factor(weight), fill = n)) +
  geom_tile(colour = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(
    title = "Count of Age and Weight Combinations",
    x = "Age",
    y = "Weight (kg)"
  )

Two continuous variables

library(hexbin)

# Hexagonal-bin plot of height against weight, restricted to rows where
# both measurements are present.
df %>%
  filter(!is.na(height) & !is.na(weight)) %>%
  ggplot(mapping = aes(x = height, y = weight)) +
  geom_hex() +
  labs(title = "Hexbin Plot of Height vs Weight")

Patterns and models

library(modelr)

# Keep only the records that have both a height and a weight.
df_clean <- filter(df, !is.na(height) & !is.na(weight))

# Linear model of log(weight) on log(height).
mod <- lm(log(weight) ~ log(height), data = df_clean)

# Attach the model residuals as `resid`, then exponentiate them to bring
# them back from the log scale.
df2 <- mutate(
  add_residuals(df_clean, mod),
  resid = exp(resid)
)