rm(list = ls())
# Make sure to pick a good bin width for the histogram. Make sure to pick a
# good bandwith for the kernel smoother.
kcTemp <- c(43.8, 40.1, 49.2, 41.8, 34, 49.1, 47.8, 48.1, 37.6, 42, 43.7, 47.1,
47.7, 46.9, 36.5, 45, 48, 37.6, 42.2, 38.7, 45.2, 42.5, 43.1, 36, 47.4,
48.5, 47.1, 43.2, 43.8, 45.7)
df <- data.frame(temp = kcTemp)
x <- rnorm(100)
densi <- density(kcTemp, bw = 1.5)
# ggplot Overfit: Delta = 1
ggplot(df, aes(x = temp, y = ..density..)) + geom_histogram(bins = 8, closed = "left") +
geom_density(col = "red", bw = 1)# Too general: Delta = 2
ggplot(df, aes(x = temp, y = ..density..)) + geom_histogram(bins = 8, closed = "left") +
geom_density(col = "red", bw = 2)# Somewhere in the middle: Delta = 1.5
ggplot(df, aes(x = temp, y = ..density..)) + geom_histogram(bins = 8, closed = "left") +
geom_density(col = "red", bw = 1.5)rm(list = ls())
df <- as.data.frame(state.x77)
names(df) <- c("Population", "Income", "Illiteracy", "LifeExp", "Murder", "Grad",
"Frost", "Area")
df$GradSq <- df$Grad^2
head(df) Population Income Illiteracy LifeExp Murder Grad Frost Area
Alabama 3615 3624 2.1 69.05 15.1 41.3 20 50708
Alaska 365 6315 1.5 69.31 11.3 66.7 152 566432
Arizona 2212 4530 1.8 70.55 7.8 58.1 15 113417
Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 51945
California 21198 5114 1.1 71.71 10.3 62.6 20 156361
Colorado 2541 4884 0.7 72.06 6.8 63.9 166 103766
GradSq
Alabama 1705.69
Alaska 4448.89
Arizona 3375.61
Arkansas 1592.01
California 3918.76
Colorado 4083.21
ggp <- ggplot(df, aes(x = Grad, y = Income)) + geom_point() + labs(title = "HS Grad Rate & Per Capita Income",
subtitle = "Points are individual states", x = "% HS Graduates (1970)",
y = "Per Capita Income (1974)")
ggp
Call:
lm(formula = Income ~ Grad, data = df)
Coefficients:
(Intercept) Grad
1931.10 47.16
Call:
lm(formula = Income ~ Grad + GradSq, data = df)
Coefficients:
(Intercept) Grad GradSq
-1505.424 183.196 -1.313
# Plot
ggp + stat_smooth(method = "lm", formula = y ~ poly(x, 2), se = FALSE, color = "red",
lwd = 0.5)# Fitting Loess regression models with spans between .1 and 1
fits <- data_frame(span = seq(from = 0.1, to = 1.5, by = 0.1)) %>% group_by(span) %>%
do(augment(loess(Income ~ Grad, df, degree = 2, span = .$span)))Warning: `data_frame()` is deprecated, use `tibble()`.
This warning is displayed once per session.
Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
parametric, : pseudoinverse used at 52.5
Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
parametric, : neighborhood radius 0.2
Warning in simpleLoess(y, x, w, span, degree = degree, parametric =
parametric, : reciprocal condition number 4.7944e-17
# Plotting smoothing parameters
p.gif <- ggplot(fits, aes(x = Grad, y = Income, frame = span)) + geom_point() +
geom_line(aes(y = .fitted), color = "red") + transition_states(span) + labs(title = "Loess Regression: tuning the span parameter",
subtitle = "Span: {closest_state}", x = "% HS Graduates (1970)", y = "Per Capita Income (1974)")
p.gifCall:
loess(formula = Income ~ Grad, data = df, span = 1.25)
Number of Observations: 50
Equivalent Number of Parameters: 3.2
Residual Standard Error: 476.9
rm(list = ls())
set.seed(1234)
X <- runif(100, 0, 10)
Y <- sin(X) + rnorm(100, 0, 0.3)
df <- data.frame(x = X, y = Y)# Fitting Loess regression models with spans between .1 and 1
fits <- data_frame(span = seq(from = 0.1, to = 1, by = 0.05)) %>% group_by(span) %>%
do(augment(loess(y ~ x, df, degree = 2, span = .$span)))
# Plotting smoothing parameters
p.gif <- ggplot(fits, aes(x, y, frame = span)) + geom_point() + geom_line(aes(y = .fitted),
color = "red") + transition_states(span) + labs(title = "Loess Regression: tuning the span parameter",
subtitle = "Span: {closest_state}")
p.gifBased on the animated graph, an appropriate values for span appear to be between 0.25 & 0.7.
set.seed(1234)
X <- runif(100, 0, 10)
Y <- sin(X) + rnorm(100, 0, Sigma[1])
df <- data.frame(x = X, y = Y)
g1 <- ggplot(df, aes(x, y)) + geom_point() + stat_smooth(method = "loess", se = FALSE,
color = "black", lwd = 0.5, span = 0.4) + labs(title = "σ = .1")
# Fitting Loess regression models with spans between .1 and 1
fits <- data_frame(span = seq(from = 0.1, to = 1, by = 0.05)) %>% group_by(span) %>%
do(augment(loess(y ~ x, df, degree = 2, span = .$span)))
# Plotting smoothing parameters
p.gif <- ggplot(fits, aes(x, y, frame = span)) + geom_point() + geom_line(aes(y = .fitted),
color = "red") + transition_states(span) + labs(title = "Loess Regression: σ = .1",
subtitle = "Span: {closest_state}")
p.gifset.seed(1234)
X <- runif(100, 0, 10)
Y <- sin(X) + rnorm(100, 0, Sigma[2])
df <- data.frame(x = X, y = Y)
g2 <- ggplot(df, aes(x, y)) + geom_point() + stat_smooth(method = "loess", se = FALSE,
color = "black", lwd = 0.5, span = 0.4) + labs(title = "σ = .5")
# Fitting Loess regression models with spans between .1 and 1
fits <- data_frame(span = seq(from = 0.1, to = 1, by = 0.05)) %>% group_by(span) %>%
do(augment(loess(y ~ x, df, degree = 2, span = .$span)))
# Plotting smoothing parameters
p.gif <- ggplot(fits, aes(x, y, frame = span)) + geom_point() + geom_line(aes(y = .fitted),
color = "red") + transition_states(span) + labs(title = "Loess Regression: σ = .5",
subtitle = "Span: {closest_state}")
p.gifset.seed(1234)
X <- runif(100, 0, 10)
Y <- sin(X) + rnorm(100, 0, Sigma[3])
df <- data.frame(x = X, y = Y)
g3 <- ggplot(df, aes(x, y)) + geom_point() + stat_smooth(method = "loess", se = FALSE,
color = "black", lwd = 0.5, span = 0.4) + labs(title = "σ = 1")
# Fitting Loess regression models with spans between .1 and 1
fits <- data_frame(span = seq(from = 0.1, to = 1, by = 0.05)) %>% group_by(span) %>%
do(augment(loess(y ~ x, df, degree = 2, span = .$span)))
# Plotting smoothing parameters
p.gif <- ggplot(fits, aes(x, y, frame = span)) + geom_point() + geom_line(aes(y = .fitted),
color = "red") + transition_states(span) + labs(title = "Loess Regression: σ = 1",
subtitle = "Span: {closest_state}")
p.gifset.seed(1234)
X <- runif(100, 0, 10)
Y <- sin(X) + rnorm(100, 0, Sigma[4])
df <- data.frame(x = X, y = Y)
g4 <- ggplot(df, aes(x, y)) + geom_point() + stat_smooth(method = "loess", se = FALSE,
color = "black", lwd = 0.5, span = 0.4) + labs(title = "σ = 1.5")
# Fitting Loess regression models with spans between .1 and 1
fits <- data_frame(span = seq(from = 0.1, to = 1, by = 0.05)) %>% group_by(span) %>%
do(augment(loess(y ~ x, df, degree = 2, span = .$span)))
# Plotting smoothing parameters
p.gif <- ggplot(fits, aes(x, y, frame = span)) + geom_point() + geom_line(aes(y = .fitted),
color = "red") + transition_states(span) + labs(title = "Loess Regression: σ = 1.5",
subtitle = "Span: {closest_state}")
p.gifIt seems that as random variability in the data increases, the range of reasonable spans shifts right and becomes wider. That is, for lower \(\sigma\) noise levels, the range of appropriate spans seems to be [.2,.6], but for higher \(\sigma\) noise levels a more appropriate range might be [.5,1+].
Comments on the similarities and differences between these three different fits.
Because the scatterplot of the data may be indicating some sort of quadratic relationship, the linear regression model with a quadratic effect (red) and the loess model (black) seem to better fit the data than the simple linear regression (blue). Depending on how \(\alpha\) is tuned, the loess regression model may overfit the data (lower bandwidth values) or may be too general (higher values), but somewhere in the goldilocks range is a tuning parameter value that allows the Loess model to more accurately illustrate the relationship between variables by performing least squares regressions on local subsets of data. When the tuning parameter is too high, the loess regression line fits similar to the linear regression model with a quadratic effect because the size of local subsets used for fitting isn’t small enough to pick up subtleties in the data.