Regression: Error analysis/ Data

Code
# Load data
data <- read.csv("SM_data.csv")

# Fit regression model
model <- lm(H ~ AL + PL + FL, data = data)

# Summary
summary(model)

Call:
lm(formula = H ~ AL + PL + FL, data = data)

Residuals:
    Min      1Q  Median      3Q     Max 
-31.797  -6.035   1.201   5.402  18.234 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 141.09867    7.66687  18.404  < 2e-16 ***
AL           -0.05856    0.08333  -0.703  0.48453    
PL            2.66508    0.77852   3.423  0.00104 ** 
FL           -0.81509    0.66243  -1.230  0.22265    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9.792 on 70 degrees of freedom
Multiple R-squared:  0.2608,    Adjusted R-squared:  0.2291 
F-statistic: 8.231 on 3 and 70 DF,  p-value: 9.138e-05
Code
# -----------------------------
# Residual diagnostics
# -----------------------------

# Raw residuals
res <- residuals(model)

# Standardized residuals
std_res <- rstandard(model)

# Studentized residuals (internal)
stud_res <- rstudent(model)

# Studentized deleted residuals (external)
stud_del_res <- rstudent(model)

# Add to dataset
data$residuals <- res
data$std_res <- std_res
data$stud_res <- stud_res
data$stud_del_res <- stud_del_res

# -----------------------------
# Outlier detection criteria
# -----------------------------

# Thresholds
n <- nrow(data)
p <- length(coef(model))

# Common cutoffs
std_cut <- 2
stud_cut <- 2
stud_del_cut <- 2

# Flag outliers
data$outlier_flag <- ifelse(
  abs(std_res) > std_cut | 
    abs(stud_res) > stud_cut | 
    abs(stud_del_res) > stud_del_cut,
  "Outlier", "Normal"
)
par(mfrow=c(2,2))
# -----------------------------
# Plot: Residuals vs Fitted
# -----------------------------
plot(fitted(model), res,
     col = ifelse(data$outlier_flag == "Outlier", "red", "black"),
     pch = 19,
     xlab = "Fitted Values",
     ylab = "Residuals",
     main = "Residuals vs Fitted")

abline(h = 0, lty = 2)

# -----------------------------
# Plot: Standardized Residuals
# -----------------------------
plot(fitted(model), std_res,
     col = ifelse(abs(std_res) > std_cut, "red", "black"),
     pch = 19,
     xlab = "Fitted Values",
     ylab = "Standardized Residuals",
     main = "Standardized Residuals")

abline(h = c(-2, 2), col = "blue", lty = 2)

# -----------------------------
# Plot: Studentized Residuals
# -----------------------------
plot(fitted(model), stud_res,
     col = ifelse(abs(stud_res) > stud_cut, "red", "black"),
     pch = 19,
     xlab = "Fitted Values",
     ylab = "Studentized Residuals",
     main = "Studentized Residuals")

abline(h = c(-2, 2), col = "blue", lty = 2)

# -----------------------------
# Plot: Studentized Deleted Residuals
# -----------------------------
plot(fitted(model), stud_del_res,
     col = ifelse(abs(stud_del_res) > stud_del_cut, "red", "black"),
     pch = 19,
     xlab = "Fitted Values",
     ylab = "Studentized Deleted Residuals",
     main = "Externally Studentized Residuals")

abline(h = c(-2, 2), col = "blue", lty = 2)

Code
# -----------------------------
# Print detected outliers
# -----------------------------
outliers <- data[data$outlier_flag == "Outlier", ]
print(outliers)
       H   PL   AL   FL MF residuals   std_res  stud_res stud_del_res
8  172.0 10.0 30.0 15.0  M  18.23369  1.967265  2.009503     2.009503
13 132.0 15.5 25.5 21.0  F -31.79726 -3.282379 -3.542885    -3.542885
45 144.0 15.0 19.5 20.0  F -19.63118 -2.037365 -2.085539    -2.085539
72 139.7 15.4 21.7 21.8  F -23.40122 -2.416936 -2.506471    -2.506471
   outlier_flag
8       Outlier
13      Outlier
45      Outlier
72      Outlier
Code
# -----------------------------
# Remove outliers
# -----------------------------
data_clean <- data[data$outlier_flag == "Normal", ]

# -----------------------------
# Refit model without outliers
# -----------------------------
model_clean <- lm(H ~ AL + PL + FL, data = data_clean)

# -----------------------------
# Compare coefficients
# -----------------------------
coef_before <- coef(model)
coef_after  <- coef(model_clean)

comparison <- data.frame(
  Coefficient = names(coef_before),
  With_Outliers = coef_before,
  Without_Outliers = coef_after,
  Difference = coef_after - coef_before
)

print("Comparison of Regression Coefficients:")
[1] "Comparison of Regression Coefficients:"
Code
print(comparison)
            Coefficient With_Outliers Without_Outliers  Difference
(Intercept) (Intercept)  141.09866911     143.90387462  2.80520551
AL                   AL   -0.05856177      -0.09224297 -0.03368121
PL                   PL    2.66507906       2.72647019  0.06139113
FL                   FL   -0.81508620      -0.90266346 -0.08757726
Code
# -----------------------------
# Compare model summaries
# -----------------------------
cat("\n--- Model WITH Outliers ---\n")

--- Model WITH Outliers ---
Code
print(summary(model))

Call:
lm(formula = H ~ AL + PL + FL, data = data)

Residuals:
    Min      1Q  Median      3Q     Max 
-31.797  -6.035   1.201   5.402  18.234 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 141.09867    7.66687  18.404  < 2e-16 ***
AL           -0.05856    0.08333  -0.703  0.48453    
PL            2.66508    0.77852   3.423  0.00104 ** 
FL           -0.81509    0.66243  -1.230  0.22265    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 9.792 on 70 degrees of freedom
Multiple R-squared:  0.2608,    Adjusted R-squared:  0.2291 
F-statistic: 8.231 on 3 and 70 DF,  p-value: 9.138e-05
Code
cat("\n--- Model WITHOUT Outliers ---\n")

--- Model WITHOUT Outliers ---
Code
print(summary(model_clean))

Call:
lm(formula = H ~ AL + PL + FL, data = data_clean)

Residuals:
     Min       1Q   Median       3Q      Max 
-17.0648  -5.8727   0.5875   4.5892  15.5012 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 143.90387    6.80300  21.153  < 2e-16 ***
AL           -0.09224    0.06962  -1.325    0.190    
PL            2.72647    0.65049   4.191 8.42e-05 ***
FL           -0.90266    0.55431  -1.628    0.108    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 8.137 on 66 degrees of freedom
Multiple R-squared:  0.3289,    Adjusted R-squared:  0.2984 
F-statistic: 10.78 on 3 and 66 DF,  p-value: 7.446e-06
Code
# -----------------------------
# Plot comparison (optional)
# -----------------------------
par(mfrow=c(1,2))

# Before
plot(fitted(model), residuals(model),
     main = "With Outliers",
     col = ifelse(data$outlier_flag == "Outlier", "red", "black"),
     pch = 19,
     xlab = "Fitted", ylab = "Residuals")
abline(h=0, lty=2)

# After
plot(fitted(model_clean), residuals(model_clean),
     main = "Without Outliers",
     col = "black",
     pch = 19,
     xlab = "Fitted", ylab = "Residuals")
abline(h=0, lty=2)

Code
# -----------------------------
# Count removed observations
# -----------------------------
cat("\nNumber of observations removed:",
    nrow(data) - nrow(data_clean), "\n")

Number of observations removed: 4