---
title: "Regression: Error analysis/ Data"
format:
html:
theme:
light: cosmo
dark: darkly
toc: true
toc-depth: 3
number-sections: true
code-fold: true
code-tools: true
smooth-scroll: true
anchor-sections: true
fontsize: 1.05em
css: styles.css
execute:
echo: true
warning: false
message: false
cache: false
editor: visual
---
```{r}
# Load data
data <- read.csv("SM_data.csv")
# Fit regression model
model <- lm(H ~ AL + PL + FL, data = data)
# Summary
summary(model)
# -----------------------------
# Residual diagnostics
# -----------------------------
# Raw residuals
res <- residuals(model)
# Standardized residuals
std_res <- rstandard(model)
# Studentized residuals (internal)
stud_res <- rstudent(model)
# Studentized deleted residuals (external)
stud_del_res <- rstudent(model)
# Add to dataset
data$residuals <- res
data$std_res <- std_res
data$stud_res <- stud_res
data$stud_del_res <- stud_del_res
# -----------------------------
# Outlier detection criteria
# -----------------------------
# Thresholds
n <- nrow(data)
p <- length(coef(model))
# Common cutoffs
std_cut <- 2
stud_cut <- 2
stud_del_cut <- 2
# Flag outliers
data$outlier_flag <- ifelse(
abs(std_res) > std_cut |
abs(stud_res) > stud_cut |
abs(stud_del_res) > stud_del_cut,
"Outlier", "Normal"
)
par(mfrow=c(2,2))
# -----------------------------
# Plot: Residuals vs Fitted
# -----------------------------
plot(fitted(model), res,
col = ifelse(data$outlier_flag == "Outlier", "red", "black"),
pch = 19,
xlab = "Fitted Values",
ylab = "Residuals",
main = "Residuals vs Fitted")
abline(h = 0, lty = 2)
# -----------------------------
# Plot: Standardized Residuals
# -----------------------------
plot(fitted(model), std_res,
col = ifelse(abs(std_res) > std_cut, "red", "black"),
pch = 19,
xlab = "Fitted Values",
ylab = "Standardized Residuals",
main = "Standardized Residuals")
abline(h = c(-2, 2), col = "blue", lty = 2)
# -----------------------------
# Plot: Studentized Residuals
# -----------------------------
plot(fitted(model), stud_res,
col = ifelse(abs(stud_res) > stud_cut, "red", "black"),
pch = 19,
xlab = "Fitted Values",
ylab = "Studentized Residuals",
main = "Studentized Residuals")
abline(h = c(-2, 2), col = "blue", lty = 2)
# -----------------------------
# Plot: Studentized Deleted Residuals
# -----------------------------
plot(fitted(model), stud_del_res,
col = ifelse(abs(stud_del_res) > stud_del_cut, "red", "black"),
pch = 19,
xlab = "Fitted Values",
ylab = "Studentized Deleted Residuals",
main = "Externally Studentized Residuals")
abline(h = c(-2, 2), col = "blue", lty = 2)
# -----------------------------
# Print detected outliers
# -----------------------------
outliers <- data[data$outlier_flag == "Outlier", ]
print(outliers)
# -----------------------------
# Remove outliers
# -----------------------------
data_clean <- data[data$outlier_flag == "Normal", ]
# -----------------------------
# Refit model without outliers
# -----------------------------
model_clean <- lm(H ~ AL + PL + FL, data = data_clean)
# -----------------------------
# Compare coefficients
# -----------------------------
coef_before <- coef(model)
coef_after <- coef(model_clean)
comparison <- data.frame(
Coefficient = names(coef_before),
With_Outliers = coef_before,
Without_Outliers = coef_after,
Difference = coef_after - coef_before
)
print("Comparison of Regression Coefficients:")
print(comparison)
# -----------------------------
# Compare model summaries
# -----------------------------
cat("\n--- Model WITH Outliers ---\n")
print(summary(model))
cat("\n--- Model WITHOUT Outliers ---\n")
print(summary(model_clean))
# -----------------------------
# Plot comparison (optional)
# -----------------------------
par(mfrow=c(1,2))
# Before
plot(fitted(model), residuals(model),
main = "With Outliers",
col = ifelse(data$outlier_flag == "Outlier", "red", "black"),
pch = 19,
xlab = "Fitted", ylab = "Residuals")
abline(h=0, lty=2)
# After
plot(fitted(model_clean), residuals(model_clean),
main = "Without Outliers",
col = "black",
pch = 19,
xlab = "Fitted", ylab = "Residuals")
abline(h=0, lty=2)
# -----------------------------
# Count removed observations
# -----------------------------
cat("\nNumber of observations removed:",
nrow(data) - nrow(data_clean), "\n")
```