<< Data Analysis >> Note part 8

Created on 13 Aug 2013
Revised on Thu Aug 15 15:01:13 2013

1. Smoothing

Key ideas

   Sometimes there are non-linear trends in data
   We can use "smoothing" to try to capture these
   Still a risk of overfitting
   Often hard to interpret

1.1 CD4 Data

require(RCurl)
## Loading required package: RCurl
## Loading required package: bitops
myCsv <- getURL("https://dl.dropboxusercontent.com/u/8272421/cd4.csv", ssl.verifypeer = FALSE)
cd4Data <- read.csv(textConnection(myCsv), col.names = c("time", "cd4", "age", 
    "packs", "drugs", "sex", "cesd", "id"))
cd4Data <- cd4Data[order(cd4Data$time), ]
head(cd4Data)
##        time  cd4   age packs drugs sex cesd    id
## 1279 -2.990  814  6.17     3     1   5   -3 30183
## 2190 -2.990  400 -6.02     0     0   3   -4 41406
## 1167 -2.984  467 13.94     0     1   1    0 30046
## 1427 -2.957  749 -4.54     0     1  -1   -7 30498
## 2032 -2.951 1218  5.57     3     1   5    3 41032
## 1813 -2.949 1015 -9.15     2     1   0   -7 40375

CD4 over time

plot(cd4Data$time, cd4Data$cd4, pch = 19, cex = 0.1)

plot of chunk unnamed-chunk-2

Average first 2 points

plot(cd4Data$time, cd4Data$cd4, pch = 19, cex = 0.1)
x = mean(cd4Data$time[1:2])
y = mean(cd4Data$cd4[1:2])
points(x, y, col = "blue", pch = 19)

plot of chunk unnamed-chunk-3

Average second and third points

plot(cd4Data$time, cd4Data$cd4, pch = 19, cex = 0.1)
x = mean(cd4Data$time[1:2])
y = mean(cd4Data$cd4[1:2])
points(x, y, col = "blue", pch = 19)
x = mean(cd4Data$time[2:3])
y = mean(cd4Data$cd4[2:3])
points(x, y, col = "blue", pch = 19)

plot of chunk unnamed-chunk-4

A moving average

plot(cd4Data$time, cd4Data$cd4, pch = 19, cex = 0.1)
aveTime <- aveCd4 <- rep(NA, length(3:(dim(cd4Data)[1] - 2)))
for (i in 3:(dim(cd4Data)[1] - 2)) {
    aveTime[i] <- mean(cd4Data$time[(i - 2):(i + 2)])
    aveCd4[i] <- mean(cd4Data$cd4[(i - 2):(i + 2)])
}
lines(aveTime, aveCd4, col = "blue", lwd = 3)

plot of chunk unnamed-chunk-5

Average more points

plot(cd4Data$time, cd4Data$cd4, pch = 19, cex = 0.1)
aveTime <- aveCd4 <- rep(NA, length(11:(dim(cd4Data)[1] - 10)))
for (i in 11:(dim(cd4Data)[1] - 2)) {
    aveTime[i] <- mean(cd4Data$time[(i - 10):(i + 10)])
    aveCd4[i] <- mean(cd4Data$cd4[(i - 10):(i + 10)])
}
lines(aveTime, aveCd4, col = "blue", lwd = 3)

plot of chunk unnamed-chunk-6

Average many more

plot(cd4Data$time, cd4Data$cd4, pch = 19, cex = 0.1)
aveTime <- aveCd4 <- rep(NA, length(201:(dim(cd4Data)[1] - 200)))
for (i in 201:(dim(cd4Data)[1] - 2)) {
    aveTime[i] <- mean(cd4Data$time[(i - 200):(i + 200)])
    aveCd4[i] <- mean(cd4Data$cd4[(i - 200):(i + 200)])
}
lines(aveTime, aveCd4, col = "blue", lwd = 3)

plot of chunk unnamed-chunk-7

A faster way

filtTime <- as.vector(filter(cd4Data$time, filter = rep(1, 200))/200)
filtCd4 <- as.vector(filter(cd4Data$cd4, filter = rep(1, 200))/200)
plot(cd4Data$time, cd4Data$cd4, pch = 19, cex = 0.1)
lines(filtTime, filtCd4, col = "blue", lwd = 3)

plot of chunk unnamed-chunk-8

Averaging = weighted sums

filtCd4 <- as.vector(filter(cd4Data$cd4, filter = rep(1, 4))/4)
filtCd4[2]
## [1] 607.5
sum(cd4Data$cd4[1:4] * rep(1/4, 4))
## [1] 607.5

Other weights -> should sum to one

ws = 10
tukey = function(x) pmax(1 - x^2, 0)^2
filt = tukey(seq(-ws, ws)/(ws + 1))
filt = filt/sum(filt)
plot(seq(-(ws), (ws)), filt, pch = 19)

plot of chunk unnamed-chunk-10

ws = 100
tukey = function(x) pmax(1 - x^2, 0)^2
filt = tukey(seq(-ws, ws)/(ws + 1))
filt = filt/sum(filt)
filtTime <- as.vector(filter(cd4Data$time, filter = filt))
filtCd4 <- as.vector(filter(cd4Data$cd4, filter = filt))
plot(cd4Data$time, cd4Data$cd4, pch = 19, cex = 0.1)
lines(filtTime, filtCd4, col = "blue", lwd = 3)

plot of chunk unnamed-chunk-11

Lowess (loess)

lw1 <- loess(cd4 ~ time, data = cd4Data)
plot(cd4Data$time, cd4Data$cd4, pch = 19, cex = 0.1)
lines(cd4Data$time, lw1$fitted, col = "blue", lwd = 3)

plot of chunk unnamed-chunk-12

Span

plot(cd4Data$time, cd4Data$cd4, pch = 19, cex = 0.1, ylim = c(500, 1500))
lines(cd4Data$time, loess(cd4 ~ time, data = cd4Data, span = 0.1)$fitted, col = "blue", 
    lwd = 3)
lines(cd4Data$time, loess(cd4 ~ time, data = cd4Data, span = 0.25)$fitted, col = "red", 
    lwd = 3)
lines(cd4Data$time, loess(cd4 ~ time, data = cd4Data, span = 0.76)$fitted, col = "green", 
    lwd = 3)

plot of chunk unnamed-chunk-13

Predicting with loess

tme <- seq(-2, 5, length = 100)
pred1 = predict(lw1, newdata = data.frame(time = tme), se = TRUE)
plot(tme, pred1$fit, col = "blue", lwd = 3, type = "l", ylim = c(0, 2500))
lines(tme, pred1$fit + 1.96 * pred1$se.fit, col = "red", lwd = 3)
lines(tme, pred1$fit - 1.96 * pred1$se.fit, col = "red", lwd = 3)
points(cd4Data$time, cd4Data$cd4, pch = 19, cex = 0.1)

plot of chunk unnamed-chunk-14

Splines in R

library(splines)
ns1 <- ns(cd4Data$time, df = 3)
oldpar = par(mfrow = c(1, 3))
plot(cd4Data$time, ns1[, 1])
plot(cd4Data$time, ns1[, 2])
plot(cd4Data$time, ns1[, 3])

plot of chunk unnamed-chunk-15

par(oldpar)

Regression with splines

lm1 <- lm(cd4Data$cd4 ~ ns1)
summary(lm1)
## 
## Call:
## lm(formula = cd4Data$cd4 ~ ns1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -751.0 -205.6  -30.3  175.3 2337.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1043.5       31.0   33.71   <2e-16 ***
## ns11          -774.9       29.8  -26.03   <2e-16 ***
## ns12          -630.1       72.6   -8.68   <2e-16 ***
## ns13          -397.6       37.7  -10.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 326 on 2372 degrees of freedom
## Multiple R-squared:   0.3,   Adjusted R-squared:  0.299 
## F-statistic:  339 on 3 and 2372 DF,  p-value: <2e-16

Fitted values

plot(cd4Data$time, cd4Data$cd4, pch = 19, cex = 0.1)
points(cd4Data$time, lm1$fitted, col = "blue", pch = 19, cex = 0.5)

plot of chunk unnamed-chunk-17