Read in Anscombe Dataset (publicly available in R) and call relevant libraries

# Load required libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(patchwork)  # For combining plots

# Reshape Anscombe's quartet into long format
data(anscombe)
df <- anscombe

# Convert to long format for easier plotting
anscombe_long <- tibble(
  x = c(df$x1, df$x2, df$x3, df$x4),
  y = c(df$y1, df$y2, df$y3, df$y4),
  set = rep(c("Set 1", "Set 2", "Set 3", "Set 4"), each = 11)
)

Figure 5.1: Scatterplots of the Four Datasets and Linear Regression line

# Create base plot function
plot_lm <- function(data, set_name) {
  ggplot(data, aes(x = x, y = y)) +
    geom_point(shape = 16, color = "black") +
    geom_smooth(method = "lm", se = FALSE, color = "blue") +
    labs(title = paste(" ", set_name), x = "x", y = "y") +
    theme_minimal()
}

# Split by set and make plots
plots <- anscombe_long %>%
  group_split(set) %>%
  purrr::map2(unique(anscombe_long$set), ~plot_lm(.x, .y))

# Arrange in 2x2 panel
(plots[[1]] | plots[[2]]) / (plots[[3]] | plots[[4]])
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'
## `geom_smooth()` using formula = 'y ~ x'

Figure 5.2: Loess Regression Plots

(Notice error messages indicate no loess plot for Data Set 4 because of so many redundant x values)

ggplot(anscombe_long, aes(x = x, y = y)) +
  geom_point(size = 2) +
  geom_smooth(method = "loess", se = FALSE, color = "darkred", linewidth = 1) +
  facet_wrap(~set, nrow = 2) +
  labs(title = "Anscombe: Loess Smoothing", x = "x", y = "y") +
  theme_minimal(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : at 7.945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : radius 0.003025
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : all data on boundary of neighborhood. make span bigger
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 7.945
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 0.055
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 122.21
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : zero-width neighborhood. make span bigger
## Warning: Failed to fit group -1.
## Caused by error in `predLoess()`:
## ! NA/NaN/Inf in foreign function call (arg 5)

Figure 5.3 Loess Regression Example

Read in Simulated data predicting Enhancement Reasons from Alcohol Consumption

expdata <- read.table("ahbloessexampleSIM.txt",header=T)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
describe(expdata)
##         vars    n  mean   sd median trimmed  mad min max range  skew kurtosis
## enhance    1 1904 13.89 4.01     14   14.09 4.45   4  21    17 -0.40    -0.39
## alcohol    2 1904  5.14 2.41      5    5.07 2.97   1  10     9  0.26    -0.77
##           se
## enhance 0.09
## alcohol 0.06

Figure 5.3: Loess Regression Plot

#Install lavaan if needed
if (!require("cowplot", character.only = TRUE)) {
  install.packages("cowplot")
  library(cowplot, character.only = TRUE)
}
## Loading required package: cowplot
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:patchwork':
## 
##     align_plots
library(cowplot)
ggplot(expdata, aes(x=alcohol, y=enhance, se=TRUE))+geom_jitter()+
  stat_smooth()+
  background_grid(major = c("xy"),
                  minor = c("none"), size.major = 0.2,
                  colour.major = "grey90", colour.minor = "black98")
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'

# Figure 5.4 First Anscombe Dataset with Confidence Interval

# Extract the first dataset (x1 and y1)
anscombe_df <- data.frame(x = anscombe$x1, y = anscombe$y1)
ggplot(anscombe_df, aes(x = x, y = y)) +
  geom_point(color = "black") +
  geom_smooth(method = "lm", se = TRUE, color = "blue", fill = "lightblue", formula = y ~ x) +
  labs(
    title = "Linear Regression with 95% Confidence Band",
    x = "x1",
    y = "y1"
  ) +
  theme_minimal()

Figure 5.5: Hat (Influence) Diagnostic Plot for the Anscombe Datasets

# Calculate centroid (mean x, mean y) for each group
centroids <- anscombe_long %>%
  group_by(set) %>%
  summarize(cx = mean(x), cy = mean(y))

# Join centroids to original data
df_centroid <- anscombe_long %>%
  left_join(centroids, by = "set")

# Plot with segments to centroid
ggplot(df_centroid, aes(x = x, y = y)) +
  geom_segment(aes(xend = cx, yend = cy), alpha = 0.4, color = "gray") +
  geom_point(size = 2, color = "black") +
  geom_point(aes(x = cx, y = cy), color = "red", size = 3, shape = 16) +  # red dot
  facet_wrap(~set, nrow = 2) +
  labs(title = "Anscombe: Deviation from Centroid", x = "x", y = "y") +
  scale_x_continuous(breaks = c(4, 8, 12, 16, 20)) +
  scale_y_continuous(breaks = c(3, 5, 7, 9, 11, 13)) +
  theme_minimal(base_size = 14)