Regression plots

#Question 1
library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Settings for the no-intercept model Y = beta * X + error.
beta <- 3.5           # true slope
sigma <- 1.2          # standard deviation of the error term
x_min <- 0            # lower end of the X range
x_max <- 6            # upper end of the X range
x_step <- 1.5         # spacing between the X positions that get a curve
curve_spread <- 0.4   # multiplier turning density height into a sideways offset
x_vals <- seq(from = x_min, to = x_max, by = x_step)

# For every X position, trace the N(beta * x, sigma^2) density over
# +/- 3 sigma around its mean and shift it sideways from that position, so
# each density can later be drawn as a path running along the Y axis.
dist_data <- bind_rows(lapply(x_vals, function(x_pos) {
  centre <- beta * x_pos
  y_grid <- seq(centre - 3 * sigma, centre + 3 * sigma, length.out = 100)
  height <- dnorm(y_grid, mean = centre, sd = sigma)

  data.frame(
    x = x_pos + height * curve_spread,
    y = y_grid,
    x_original = x_pos  # grouping key: which X position the curve belongs to
  )
}))


# Points along the true regression line Y = beta * X, extended one unit
# beyond each end of the X range.
line_data <- data.frame(x = seq(x_min - 1, x_max + 1, length.out = 200))
line_data$y <- beta * line_data$x

# Simulated observations from the model Y = beta * X + error.
# The X values are drawn first and then reused for the mean of Y, so each
# point is beta * x + noise for its own x. Drawing a second, independent set
# of uniform values for the mean would leave the point cloud unrelated to
# the regression line.
set.seed(123)
sample_data <- data.frame(x = runif(30, x_min, x_max))
sample_data$y <- beta * sample_data$x +
  rnorm(nrow(sample_data), mean = 0, sd = sigma)

# Question 1 figure. Layers are added in drawing order: density curves, the
# true regression line, the simulated points, then the conditional means.
ggplot() +
  # Sideways normal densities, one path per X position.
  geom_path(
    mapping = aes(x = x, y = y, group = factor(x_original)),
    data = dist_data,
    color = "darkblue", linewidth = 0.8, alpha = 0.7
  ) +
  # True regression line.
  geom_line(
    mapping = aes(x = x, y = y),
    data = line_data,
    color = "red", linewidth = 1.5
  ) +
  # Simulated observations.
  geom_point(
    mapping = aes(x = x, y = y),
    data = sample_data,
    color = "forestgreen", alpha = 0.6, size = 2
  ) +
  # Diamond markers at the mean of Y for each X position that has a curve.
  geom_point(
    mapping = aes(x = x, y = y),
    data = data.frame(x = x_vals, y = beta * x_vals),
    color = "red", size = 4, shape = 18
  ) +
  labs(
    title = sprintf("No-Intercept Regression: Y = %.1fX + εrror", beta),
    subtitle = sprintf("Vertical curves show Y|X ~ N(%.1fX, %.1f²)", beta, sigma),
    caption = sprintf("Red line: True regression line | Curve spread: σ = %.1f", sigma),
    x = "Predictor Variable (X)",
    y = "Response Variable (Y)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11),
    panel.grid.major = element_line(color = "gray90"),
    panel.grid.minor = element_blank()
  ) +
  # Zoom the view without dropping data: half a unit beyond the X range and
  # three standard deviations beyond the means at both ends.
  coord_cartesian(
    xlim = c(x_min - 0.5, x_max + 0.5),
    ylim = c(beta * x_min - 3 * sigma, beta * x_max + 3 * sigma)
  ) +
  # Equation label near the top of the panel.
  annotate(
    "text",
    x = mean(x_vals), y = beta * x_max + 2 * sigma,
    label = sprintf("Y = %.1fX", beta),
    color = "red", fontface = "bold", size = 5
  ) +
  # Explanatory note in the lower right.
  annotate(
    "text",
    x = max(x_vals) - 0.5, y = beta * x_min + 1,
    label = "Each curve shows the distribution\nof possible Y values at that X",
    color = "darkblue", size = 3.5
  )

# Here, all distributions have the same spread, showing homoscedasticity. The expected value of Y given X lies on the red line.
# The actual observed values of Y are random samples from the normal distributions drawn as blue curves, each centered on the red line.

#Question2

library(ggplot2)
library(dplyr)

# Settings for the intercept-only model Y = beta0 + error.
beta0 <- 5           # The true intercept (mean of Y)
sigma <- 1.5         # Standard deviation of errors
n_curves <- 6        # Number of density curves to draw
curve_spread <- 0.4  # Multiplier turning density height into a sideways offset


# Since X doesn't matter, we'll just create curves at arbitrary X locations for demonstration.
x_vals <- seq(1, n_curves, by = 1)

# The mean is beta0 regardless of the X value, so every curve has the same
# N(beta0, sigma^2) profile over +/- 3 sigma. The profile is computed once
# and only shifted sideways to each X location; local() keeps the helper
# vectors out of the global environment.
dist_data <- local({
  yseq <- seq(beta0 - 3 * sigma, beta0 + 3 * sigma, length.out = 100)
  offset <- dnorm(yseq, mean = beta0, sd = sigma) * curve_spread

  bind_rows(lapply(x_vals, function(x0) {
    data.frame(
      x = x0 + offset,
      y = yseq,
      x_original = x0  # grouping key: which X location the curve belongs to
    )
  }))
})


# Two endpoints of a horizontal segment at height beta0, running from 0 to
# n_curves + 1.
# NOTE(review): the plot that follows draws the mean with geom_hline(), so
# this data frame looks unused there - confirm before removing it.
line_data <- data.frame(x = c(0, n_curves + 1), y = beta0)

# Simulated observations from the true model Y = beta0 + error.
# X plays no role in the model; random X values are generated only so the
# points can be spread across the plot.
set.seed(123)
sample_data <- data.frame(x = runif(40, min = 0.5, max = n_curves + 0.5))
sample_data$y <- beta0 + rnorm(40, mean = 0, sd = sigma)

# Question 2 figure. Layers are added in drawing order: density curves, the
# constant mean line, the simulated points, then the mean markers.
ggplot() +
  # Identical sideways normal densities, one path per X location.
  geom_path(
    mapping = aes(x = x, y = y, group = factor(x_original)),
    data = dist_data,
    color = "darkblue", linewidth = 0.8, alpha = 0.7
  ) +
  # Horizontal line at the constant mean of Y.
  geom_hline(yintercept = beta0, color = "red", linewidth = 1.5) +
  # Simulated observations.
  geom_point(
    mapping = aes(x = x, y = y),
    data = sample_data,
    color = "forestgreen", alpha = 0.6, size = 2.5
  ) +
  # Diamond markers at the mean for each curve location.
  geom_point(
    mapping = aes(x = x, y = y),
    data = data.frame(x = x_vals, y = beta0),
    color = "red", size = 4, shape = 18
  ) +
  labs(
    title = sprintf("Constant Mean Model: Y = %.1f + error", beta0),
    subtitle = sprintf("Y is always distributed around its mean: Y ~ N(%.1f, %.1f²)", beta0, sigma),
    caption = "The predictor X is irrelevant. All distributions are identical.",
    x = "Predictor Variable(X)- Has No Effect",
    y = "Response Variable(Y)"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    plot.subtitle = element_text(size = 11),
    panel.grid.major = element_line(color = "gray90"),
    panel.grid.minor = element_blank(),
    axis.text.x = element_blank()  # hide the X tick labels
  ) +
  # Zoom the view without dropping data: half a unit around the curve
  # locations and 3.5 standard deviations around the mean.
  coord_cartesian(
    xlim = c(0.5, n_curves + 0.5),
    ylim = c(beta0 - 3.5 * sigma, beta0 + 3.5 * sigma)
  ) +
  # Label for the mean, placed above the line.
  annotate(
    "text",
    x = mean(x_vals), y = beta0 + 2.5 * sigma,
    label = sprintf("E[Y] = %.1f", beta0),
    color = "red", fontface = "bold", size = 5
  )

# This plot visualizes the concept of no relationship between X and Y. The best prediction for Y is always its overall mean.

Regression plots

Sabuj Ganguly

2025-09-18