#Question 1
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Simulation inputs for the no-intercept model Y = beta * X + error.
beta <- 3.5   # True slope parameter (Y = beta * X)
sigma <- 1.2  # Standard deviation of the errors

# X locations at which a conditional density curve is drawn.
x_min <- 0
x_max <- 6
x_step <- 1.5
curve_spread <- 0.4  # Horizontal scale applied to the density heights
x_vals <- seq(x_min, x_max, by = x_step)

# One sideways normal curve per x0: the N(beta * x0, sigma^2) density is
# evaluated over +/- 3 sigma and its height is added to x0, so the curve
# bulges to the right of the X location it belongs to.
dist_data <- lapply(x_vals, function(x0) {
  y_mean <- beta * x0  # Center of the distribution
  yseq <- seq(y_mean - 3 * sigma, y_mean + 3 * sigma, length.out = 100)
  dens <- dnorm(yseq, mean = y_mean, sd = sigma)
  data.frame(
    x = x0 + dens * curve_spread,
    y = yseq,
    x_original = x0
  )
}) %>%
  bind_rows()

# True regression line, extended one unit beyond the X range on both sides.
line_data <- data.frame(x = seq(x_min - 1, x_max + 1, length.out = 200))
line_data$y <- beta * line_data$x

# Simulated observations. Y must be computed from the SAME x values that are
# plotted; drawing a second, independent uniform sample for the mean would
# make the points unrelated to their own x coordinate.
set.seed(123)
n_samples <- 30
sample_data <- data.frame(x = runif(n_samples, x_min, x_max))
sample_data$y <- beta * sample_data$x + rnorm(n_samples, 0, sigma)
# Figure for the no-intercept model: conditional density curves, the true
# regression line, the simulated sample and the conditional means.
ggplot() +
  # Sideways density curves; grouping by X location keeps them disconnected.
  geom_path(
    data = dist_data,
    mapping = aes(x = x, y = y, group = factor(x_original)),
    color = "darkblue", linewidth = 0.8, alpha = 0.7
  ) +
  # True regression line.
  geom_line(
    data = line_data,
    mapping = aes(x = x, y = y),
    color = "red", linewidth = 1.5
  ) +
  # Simulated observations.
  geom_point(
    data = sample_data,
    mapping = aes(x = x, y = y),
    color = "forestgreen", alpha = 0.6, size = 2
  ) +
  # Conditional means at the X locations that carry a curve.
  geom_point(
    data = data.frame(x = x_vals, y = beta * x_vals),
    mapping = aes(x = x, y = y),
    color = "red", size = 4, shape = 18
  ) +
  annotate(
    "text",
    x = mean(x_vals), y = beta * x_max + 2 * sigma,
    label = sprintf("Y = %.1fX", beta),
    color = "red", fontface = "bold", size = 5
  ) +
  annotate(
    "text",
    x = max(x_vals) - 0.5, y = beta * x_min + 1,
    label = "Each curve shows the distribution\nof possible Y values at that X",
    color = "darkblue", size = 3.5
  ) +
  # Zoom only (no data are dropped): half a unit of padding in X and
  # three sigma of padding in Y around the true line.
  coord_cartesian(
    xlim = c(x_min, x_max) + c(-0.5, 0.5),
    ylim = beta * c(x_min, x_max) + c(-3, 3) * sigma
  ) +
  labs(
    title = sprintf("No-Intercept Regression: Y = %.1fX + εrror", beta),
    subtitle = sprintf("Vertical curves show Y|X ~ N(%.1fX, %.1f²)", beta, sigma),
    caption = sprintf("Red line: True regression line | Curve spread: σ = %.1f", sigma),
    x = "Predictor Variable (X)",
    y = "Response Variable (Y)"
  ) +
  theme_minimal() +
  theme(
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "gray90"),
    plot.subtitle = element_text(size = 11),
    plot.title = element_text(face = "bold", size = 14)
  )

# Here, all distributions have the same spread (homoscedasticity). The expected value of Y given X lies on the red line.
# The observed values of Y are random draws from the normal distributions drawn as blue curves, each of which is centered on the red line.
#Question2
library(ggplot2)
library(dplyr)
# Simulation inputs for the constant-mean model Y = beta0 + error.
beta0 <- 5    # The true intercept (mean of Y)
sigma <- 1.5  # Standard deviation of errors
n_curves <- 6

# X plays no role in this model; the curves are placed at arbitrary X
# locations purely for display.
x_vals <- seq(1, n_curves, by = 1)

# Every curve is the same N(beta0, sigma^2) density over +/- 3 sigma, so it is
# computed once and only shifted horizontally to each X location.
dist_data <- local({
  y_grid <- seq(beta0 - 3 * sigma, beta0 + 3 * sigma, length.out = 100)
  x_offset <- dnorm(y_grid, mean = beta0, sd = sigma) * 0.4
  bind_rows(lapply(x_vals, function(x_centre) {
    data.frame(
      x = x_centre + x_offset,
      y = y_grid,
      x_original = x_centre
    )
  }))
})

# End points of the horizontal mean line.
# NOTE(review): the plot that follows draws the mean with geom_hline() and
# does not read this object - confirm whether anything else needs it.
line_data <- data.frame(x = c(0, n_curves + 1), y = beta0)

# Simulated sample: X is drawn at random only so the points can be spread
# across the panel; Y comes from the true model Y = beta0 + error.
set.seed(123)
sample_data <- data.frame(x = runif(40, 0.5, n_curves + 0.5))
sample_data$y <- beta0 + rnorm(nrow(sample_data), 0, sigma)
# Figure for the constant-mean model: identical density curves at every X
# location, the horizontal mean line, the simulated sample and the means.
ggplot() +
  # Sideways density curves; grouping by X location keeps them disconnected.
  geom_path(
    data = dist_data,
    mapping = aes(x = x, y = y, group = factor(x_original)),
    color = "darkblue", linewidth = 0.8, alpha = 0.7
  ) +
  # The mean of Y, which does not change with X.
  geom_hline(yintercept = beta0, color = "red", linewidth = 1.5) +
  # Simulated observations.
  geom_point(
    data = sample_data,
    mapping = aes(x = x, y = y),
    color = "forestgreen", alpha = 0.6, size = 2.5
  ) +
  # Mean marker at each X location that carries a curve.
  geom_point(
    data = data.frame(x = x_vals, y = beta0),
    mapping = aes(x = x, y = y),
    color = "red", size = 4, shape = 18
  ) +
  annotate(
    "text",
    x = mean(x_vals), y = beta0 + 2.5 * sigma,
    label = sprintf("E[Y] = %.1f", beta0),
    color = "red", fontface = "bold", size = 5
  ) +
  # Zoom only (no data are dropped): 3.5 sigma either side of the mean.
  coord_cartesian(
    xlim = c(0.5, n_curves + 0.5),
    ylim = beta0 + c(-3.5, 3.5) * sigma
  ) +
  labs(
    title = sprintf("Constant Mean Model: Y = %.1f + error", beta0),
    subtitle = sprintf("Y is always distributed around its mean: Y ~ N(%.1f, %.1f²)", beta0, sigma),
    caption = "The predictor X is irrelevant. All distributions are identical.",
    x = "Predictor Variable(X)- Has No Effect",
    y = "Response Variable(Y)"
  ) +
  theme_minimal() +
  theme(
    # X tick labels are hidden because the X positions carry no meaning here.
    axis.text.x = element_blank(),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_line(color = "gray90"),
    plot.subtitle = element_text(size = 11),
    plot.title = element_text(face = "bold", size = 14)
  )

# This plot visualizes the absence of any relationship between X and Y. The best prediction for Y is always its overall mean.