In this part we will use smoothing to help us see functional relationships in the presence of noise.
The first case will use Nadaraya-Watson Kernel Smoothes, which simply uses a weighted average of y values whose x values are close to the prediction point xp. It uses a kernel function provide the distance weights (the kernel function is a density function, non-negative function that integrates to 1). Conceptually the underlying kernel is continuous, but it is being used is for a discrete probability distribution.
set.seed(19)
x <- runif(20, min = 0, max = 10)
y <- 2 * (x - 5)^2 - 0.3 * x + 2 + rnorm(20, sd = 2) # quadratic curve with error.
xp <- 6 # prediction location
h <- 0.5 # smoothing parameter
w <- dnorm((x - xp)/h) #preliminary weights from a normal kernel
wNew <- w/sum(w) #new weights
yhat <- sum(wNew * y)
plot(x, y, pch = 16, main = "Normal Kernel: The red point is smoothed estimate")
points(xp, yhat, pch = 16, col = "red", cex = 1.1)
ord <- order(x)
cbind(x, y, wNew)[ord, ] # Points close to xp=6 get the heaviest weight
## x y wNew
## [1,] 0.684 38.6577 1.186e-25
## [2,] 1.171 32.0037 2.336e-21
## [3,] 2.239 16.0530 2.161e-13
## [4,] 2.282 14.0927 4.111e-13
## [5,] 2.919 8.6420 2.369e-09
## [6,] 3.653 5.8522 6.856e-06
## [7,] 3.975 2.6606 1.147e-04
## [8,] 4.065 5.3772 2.330e-04
## [9,] 4.274 -1.2539 1.077e-03
## [10,] 4.497 -0.9288 4.561e-03
## [11,] 4.840 2.3659 2.835e-02
## [12,] 5.677 5.4516 3.388e-01
## [13,] 5.733 4.1674 3.620e-01
## [14,] 6.512 5.7988 2.471e-01
## [15,] 7.261 8.6412 1.740e-02
## [16,] 7.910 16.5604 2.841e-04
## [17,] 8.366 23.1613 5.722e-06
## [18,] 8.382 22.8772 4.905e-06
## [19,] 9.050 33.1520 3.493e-09
## [20,] 9.242 37.3897 3.119e-10
Experimenting with Smooth Parameters
In smoothing and many areas of modeling, cross validation is a helpful tool in obtaining good parameters:
# first steps:
source("panelFunctions.r")
library(RCurl)
## Loading required package: bitops
filename = "https://docs.google.com/spreadsheet/pub?key=0AhVqDdZgThPldFFzV0ItcVEzYW5ENWNCQzJpam5OcHc&single=true&gid=0&output=csv"
myCsv <- getURL(filename)
hamster = read.csv(textConnection(myCsv), header = TRUE)
x <- hamster$hibPercent
y <- hamster$ageDays
rx <- range(x)
rx <- mean(rx) + 1.04 * diff(rx) * c(-0.5, 0.5) # pad the range of data
gridx <- panelInbounds(rx) # use the pretty function
gridx <- seq(0, 30, by = 10) #hand set values
ry <- range(y)
ry <- mean(ry) + 1.04 * diff(ry) * c(-0.5, 0.5)
gridy <- panelInbounds(ry)
gridy <- seq(100, 1600, by = 500) # hand set values
# Setting up Panel (3 rows 1 column)
pan <- panelLayout(nrow = 3, ncol = 1, rowSep = c(0, 0.1, 0.1, 0), leftMar = 1.5,
bottomMar = 0.7)
# First ROW: Smoothing with kernels (parameters:bandwidth=bandwith,kernel
# shape=kernel
panelSelect(pan, 1, 1)
panelScale(rx, ry)
## Warning: calling par(new=TRUE) with no plot
## $rx
## [1] -0.66 33.66
##
## $ry
## [1] 85.92 1650.08
panelFill("#C0C0C0")
abline(v = gridx, h = gridy, col = "white")
axis(side = 2, at = gridy, tck = 0, las = 1, mgp = c(2, 0.5, 0))
points(x, y)
lines(ksmooth(x, y, kernel = "normal", bandwidth = 1.5), lwd = 6, col = "black")
lines(ksmooth(x, y, kernel = "normal", bandwidth = 2.5), lwd = 4, col = rgb(0,
0.5, 1))
lines(ksmooth(x, y, kernel = "normal", bandwidth = 5), lwd = 3, col = "red")
panelOutline()
mtext(side = 2, line = 2.6, "Life Time In Days")
mtext(side = 3, line = 3, "Smoothing of Hamster Hybernation Data", adj = 0.5,
cex = 1.3)
mtext(side = 2, line = 4.2, "Kernel", las = 2)
# Second ROW: Smoothing With Loess (parameters: span=fraction of cases,
# degree=1 or 2
panelSelect(pan, 2, 1)
panelScale(rx, ry)
## $rx
## [1] -0.66 33.66
##
## $ry
## [1] 85.92 1650.08
panelFill("#C0C0C0")
abline(v = gridx, h = gridy, col = "white")
axis(side = 2, at = gridy, tck = 0, mgp = c(2, 0.5, 0), las = 2)
points(x, y)
lines(loess.smooth(x, y, span = 0.2), lwd = 6, col = "black")
lines(loess.smooth(x, y, span = 0.34), lwd = 4, col = rgb(0, 0.5, 1))
lines(loess.smooth(x, y, span = 0.51), lwd = 3, col = "red")
panelOutline()
text(-0.2, 0.5, "Loess", adj = 0.5, cex = 1.2)
mtext(side = 2, line = 2.6, "Life Time In Days")
mtext(side = 2, line = 4.2, "Loess", las = 2)
# Third ROW: Smoothing With Splines (parameters: approx. degrees of
# freedom in model= df; roughness penalty weight = spar knots points
panelSelect(pan, 3, 1)
panelScale(rx, ry)
## $rx
## [1] -0.66 33.66
##
## $ry
## [1] 85.92 1650.08
panelFill(col = "#C0C0C0")
par(xpd = TRUE)
abline(v = gridx, h = gridy, col = "white")
axis(side = 2, at = gridy, tck = 0, mgp = c(2, 0.5, 0), las = 2)
axis(side = 1, at = gridx, tck = 0, mgp = c(2, 0.05, 0))
nx <- x + runif(length(x), min = -0.05, max = 0.05) #add a bit of noise
points(nx, y)
lines(smooth.spline(nx, y, df = 15), lwd = 6, col = "black")
lines(smooth.spline(nx, y, df = 10), lwd = 4, col = rgb(0, 0.5, 1))
lines(smooth.spline(nx, y, df = 5), lwd = 2, col = "red")
panelOutline()
mtext(side = 2, line = 2.6, "Life Time In Days")
mtext(side = 1, line = 1.2, "Percent Time Hybernating")
mtext(side = 2, line = 4.2, "Splines", las = 2)
UPPER and LOWER SMOOTHES
A smooth splits the data into upper and lower sets. These sets can be smoothed with the result attached to the original smooth:
## Run
source("gridPlotFunctions.r")
ans <- loess(y ~ x)
res <- residuals(ans) # get the residuals from the regression objects
high <- res > 0
set1.x <- x[high]
set1.y <- res[high]
set2.x <- x[!high]
set2.y <- res[!high]
# smooth the upper and lower residuals
ans.high <- loess(set1.y ~ set1.x)
ans.low <- loess(set2.y ~ set2.x)
# add on the middle smooth
yhat <- predict(ans)
upper <- predict(ans.high) + yhat[high]
lower <- predict(ans.low) + yhat[!high]
par(mai = c(1, 1.2, 1, 0.5))
gridPlot(x, y, type = "n", main = "Upper and Lower Smooths", xlab = "Percent of Life Hybernating",
ylab = "Lifetime (Days)", lineYlabel = 3.2, titleCex = 1.2)
lines(x, yhat, lwd = 4)
lines(set1.x, upper, lwd = 2, col = "red")
lines(set2.x, lower, lwd = 2, col = "red")
points(x, y, col = "black")
SMOOTHING with GGPLOT2
Loess Smooths
library(ggplot2)
qplot(hibPercent, ageDays, data = hamster, geom = c("point", "smooth"), method = "loess",
span = 0.6, degree = 1)
Smoothing Splines
library(splines)
qplot(hibPercent, ageDays, data = hamster, main = "Hamster Hiberation and Longevity",
xlab = "Percent of Days in Hiberation", ylab = "Age in Days", geom = c("point",
"smooth"), method = "lm", formula = y ~ ns(x, 3))