# artificial data created; with 2 obvious outliers
df <- tibble::tibble(x = 1:20, y = c(1:18, 49:50))
# plot; to visualize outliers created
ggplot(df, aes(x = x, y = y)) +
geom_point(size = 2) +
theme_bw(base_size = 12)

- Observations where
x = 19 and x = 20 are the outliers.
# fit simple linear model
fit_lm <- lm(y ~ x, data = df)
# fit robust linear model
fit_rlm <- rlm(y ~ x, data = df)
# ----- Object-oriented (OO) in R
# assess class and structure of model objects fit_lm and fit_rlm
class(fit_lm)
str(fit_lm)
class(fit_rlm)
str(fit_rlm)
fit_rlm structure has additional components on top of fit_rlm structure.
- This is implied as
fit_rlm has a list of 21 elements. While fit_lm has a list of 12 elements.
- Also,
fit_rlm has takes “rlm” and “lm” as its class. fit_lm class is “lm”.
# augment do not return weights(w) from iterated re-weighted least squares
dfa <- augment(fit_rlm)
dfa
## # A tibble: 20 x 6
## y x .fitted .resid .hat .sigma
## <int> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 0.999 0.000546 0.206 10.3
## 2 2 2 2.00 0.000452 0.172 10.3
## 3 3 3 3.00 0.000358 0.143 10.3
## 4 4 4 4.00 0.000263 0.118 10.3
## 5 5 5 5.00 0.000169 0.0974 10.3
## 6 6 6 6.00 0.0000749 0.0808 10.3
## 7 7 7 7.00 -0.0000193 0.0685 10.3
## 8 8 8 8.00 -0.000114 0.0603 10.3
## 9 9 9 9.00 -0.000208 0.0564 10.3
## 10 10 10 10.0 -0.000302 0.0566 10.3
## 11 11 11 11.0 -0.000396 0.0611 10.3
## 12 12 12 12.0 -0.000490 0.0698 10.3
## 13 13 13 13.0 -0.000585 0.0827 10.3
## 14 14 14 14.0 -0.000679 0.0998 10.3
## 15 15 15 15.0 -0.000773 0.121 10.3
## 16 16 16 16.0 -0.000867 0.147 10.3
## 17 17 17 17.0 -0.000961 0.172 10.3
## 18 18 18 18.0 -0.00106 0.187 10.3
## 19 49 19 19.0 30.0 0.0000184 7.28
## 20 50 20 20.0 30.0 0.0000216 7.28
# Modify augment() in rlm class; to return weights from iterated re-weighted least squares (w);
augment.rlm <- function(x, ...){
aug <- broom:::augment.rlm(x, ...)
aug$w <- x$w
aug
}
dfa <- augment.rlm(fit_rlm)
dfa
## # A tibble: 20 x 7
## y x .fitted .resid .hat .sigma w
## <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 1 0.999 0.000546 0.206 10.3 1
## 2 2 2 2.00 0.000452 0.172 10.3 1
## 3 3 3 3.00 0.000358 0.143 10.3 1
## 4 4 4 4.00 0.000263 0.118 10.3 1
## 5 5 5 5.00 0.000169 0.0974 10.3 1
## 6 6 6 6.00 0.0000749 0.0808 10.3 1
## 7 7 7 7.00 -0.0000193 0.0685 10.3 1
## 8 8 8 8.00 -0.000114 0.0603 10.3 1
## 9 9 9 9.00 -0.000208 0.0564 10.3 1
## 10 10 10 10.0 -0.000302 0.0566 10.3 1
## 11 11 11 11.0 -0.000396 0.0611 10.3 1
## 12 12 12 12.0 -0.000490 0.0698 10.3 1
## 13 13 13 13.0 -0.000585 0.0827 10.3 1
## 14 14 14 14.0 -0.000679 0.0998 10.3 1
## 15 15 15 15.0 -0.000773 0.121 10.3 1
## 16 16 16 16.0 -0.000867 0.147 10.3 1
## 17 17 17 17.0 -0.000961 0.172 10.3 0.977
## 18 18 18 18.0 -0.00106 0.187 10.3 0.890
## 19 49 19 19.0 30.0 0.0000184 7.28 0.0000742
## 20 50 20 20.0 30.0 0.0000216 7.28 0.0000742
ggplot(dfa, aes(x, y, color = w)) +
geom_point(size = 2) +
geom_smooth(method = "lm", se = FALSE, color = "#C8008F") +
geom_smooth(method = "rlm", se = FALSE, color = "#027EB6") +
annotate("text", x = 20.5, y = c(21, 31),
label = c("rlm", "lm")) +
theme_bw(base_size = 12) +
labs(title = "Robust vs. non-robust linear models") +
scale_color_continuous(type = "viridis") # colour scheme (colour-blind friendly)
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using formula 'y ~ x'

- As shown by
w column in dfa after modifying augment(), w reduces the weight given to outliers(where x = 19 and x = 20) to estimate parameters in regression analysis. (coming up with line of best fit)
- Therefore, we expect
rlm model to have gentler slope since less weights given to the 2 outliers with high observed y values.
- Conversely, for
lm, equal weights (= 1) is applied to all observations (even to outliers) hence a steeper slope is expected.