Preparation for group presentation

References

Load packages

## ggplot2 for plotting
library(ggplot2)
## for unit()
library(gridExtra)

Functional forms

## Define n-th power of x functions (n>=1)
fx                <- function(x) x^1.0
fxOneAndHalfPower <- function(x) x^1.5
fxSquared         <- function(x) x^2.0
fxTwoAndHalfPower <- function(x) x^2.5
fxCubed           <- function(x) x^3.0

## Define n-th power of x functions (n<=1)
fxPointEightPower <- function(x) x^0.8
fxHalfPower       <- function(x) x^0.5
fxOneThirdPower   <- function(x) x^0.3
fxOneTenthPower   <- function(x) x^0.1

## Define negative powers of x functions
fxNegHalfPower    <- function(x) x^(-0.5)
fxNegOnePower     <- function(x) x^(-1.0)
fxNegTwoPower     <- function(x) x^(-2.0)

## Define non-power functions
fxExp             <- function(x) exp(x)
fxNegExp          <- function(x) exp(-x)
fxLog             <- function(x) log(x)

## Define a helper function
stat_fun <- function(fun, mapping) {
    stat_function(fun = fun, mapping = mapping, n = 1001)
}

## Provide base for plots
plotBase <- ggplot(data = data.frame(x = -1:3),
       mapping = aes(x = x)) +
    labs(color = "Functions", linetype = "Functions") +
    scale_y_continuous(limit = c(-2,5)) +
    theme_bw() +
    theme(legend.key = element_blank(),
          legend.key.width = unit(1, "cm"))

## n-th power of x functions (n>=1)
plotBase +
    stat_fun(fun = fx,                mapping = aes(linetype = "x^1.0")) +
    stat_fun(fun = fxOneAndHalfPower, mapping = aes(linetype = "x^1.5")) +
    stat_fun(fun = fxSquared,         mapping = aes(linetype = "x^2.0")) +
    stat_fun(fun = fxTwoAndHalfPower, mapping = aes(linetype = "x^2.5")) +
    stat_fun(fun = fxCubed,           mapping = aes(linetype = "x^3.0"))

plot of chunk unnamed-chunk-3


## n-th power of x functions (n<=1)
plotBase +
    stat_fun(fun = fx,                mapping = aes(linetype = "x^1.0")) +
    stat_fun(fun = fxPointEightPower, mapping = aes(linetype = "x^0.8")) +
    stat_fun(fun = fxHalfPower,       mapping = aes(linetype = "x^0.5")) +
    stat_fun(fun = fxOneThirdPower,   mapping = aes(linetype = "x^0.3")) +
    stat_fun(fun = fxOneTenthPower,   mapping = aes(linetype = "x^0.1")) +
    scale_linetype_manual(values = c("x^1.0" = 1,"x^0.8" = 2,"x^0.5" = 3,"x^0.3" = 4,"x^0.1" = 5),
                          guide = guide_legend(reverse = TRUE))

plot of chunk unnamed-chunk-3


## Negative powers of x functions
plotBase +
    stat_fun(fun = fxNegHalfPower, mapping = aes(linetype = "x^(-0.5)")) +
    stat_fun(fun = fxNegOnePower,  mapping = aes(linetype = "x^(-1.0)")) +
    stat_fun(fun = fxNegTwoPower,  mapping = aes(linetype = "x^(-2.0)"))

plot of chunk unnamed-chunk-3


## Non-power functions
plotBase +
    stat_fun(fun = fxExp,    mapping = aes(linetype = "exp(x)")) +
    stat_fun(fun = fxNegExp, mapping = aes(linetype = "exp(-x)")) +
    stat_fun(fun = fxLog,    mapping = aes(linetype = "log(x)"))

plot of chunk unnamed-chunk-3

Create dataset

## Create data
aidsData <- read.table(header = TRUE, text = "
year yearSince1976 newAidsCases personYears hivRateEstimate
1979             3            0         362               0
1980             4            0         360             100
1981             5            4         355             519
1982             6           10         353              76
1983             7           46         351             728
1984             8           92         349            1744
1985             9          201         348             433
1986            10          329         348              65
1987            11          456         348            1820
1988            12          476         347             111
1989            13          606         347            2013
1990            14          642         348             192
1991            15          791         344              57
1992            16          645         339             119
")

aidsData$personYears <- aidsData$personYears*1000

## Check
print(aidsData, row.names = F)
##  year yearSince1976 newAidsCases personYears hivRateEstimate
##  1979             3            0      362000               0
##  1980             4            0      360000             100
##  1981             5            4      355000             519
##  1982             6           10      353000              76
##  1983             7           46      351000             728
##  1984             8           92      349000            1744
##  1985             9          201      348000             433
##  1986            10          329      348000              65
##  1987            11          456      348000            1820
##  1988            12          476      347000             111
##  1989            13          606      347000            2013
##  1990            14          642      348000             192
##  1991            15          791      344000              57
##  1992            16          645      339000             119

Graphing data naively

plot(hivRateEstimate ~ yearSince1976, data = aidsData, type = "l")

plot of chunk unnamed-chunk-5


library(ggplot2)
qplot(x = year, y = hivRateEstimate, data = aidsData, geom = "line")

plot of chunk unnamed-chunk-5


aidsData$yearCat <- cut(aidsData$year,
                        breaks = c(-Inf,1980,1982,1984,1987,Inf),
                        labels = c("-1980","1981-1982","1983-1984","1985-1987","1988-"))


## glm1Categorical <- glm(formula = newAidsCases ~ yearCat + offset(log(personYears)),
##                        family  = poisson(link = "log"),
##                        data    = aidsData)

glm1Categorical <- glm(formula = hivRateEstimate ~ yearCat,
                       family  = poisson(link = "log"),
                       data    = aidsData)

newdata <- data.frame(year = seq(from = 1979, to = 1992, by = 0.05))
newdata$yearCat <- cut(newdata$year,
                       breaks = c(-Inf,1980,1982,1984,1987,Inf),
                       labels = c("-1980","1981-1982","1983-1984","1985-1987","1988-"))
## newdata <- data.frame(yearCat = levels(aidsData$yearCat), personYears = 100000)
## newdata

out <- predict(glm1Categorical, newdata = newdata)
## out
newdata$predCount <- round(exp(out), 3)
## newdata

ggplot(data = newdata,
       mapping = aes(x = year, y = predCount)) +
    layer(geom = "line") +
    theme_bw() +
    theme(legend.key = element_blank())

plot of chunk unnamed-chunk-6




## Fractional polynomial
glm2Fp <- glm(formula = hivRateEstimate ~ I(year^(1/2)) + year + I(year^(3/2) + I(year^2)),
                       family  = poisson(link = "log"),
                       data    = aidsData)
out <- predict(glm2Fp, newdata = newdata)
## out
newdata$predCount <- round(exp(out), 3)
## newdata

ggplot(data = newdata,
       mapping = aes(x = year, y = predCount)) +
    layer(geom = "line") +
    theme_bw() +
    theme(legend.key = element_blank())

plot of chunk unnamed-chunk-6



## Linear Spline
aidsData <- within(aidsData, {

    year1981 <- pmax(year - 1981, 0)
    year1983 <- pmax(year - 1983, 0)
    year1985 <- pmax(year - 1985, 0)
    year1988 <- pmax(year - 1988, 0)
 })

aidsData
##    year yearSince1976 newAidsCases personYears hivRateEstimate   yearCat year1988 year1985 year1983 year1981
## 1  1979             3            0      362000               0     -1980        0        0        0        0
## 2  1980             4            0      360000             100     -1980        0        0        0        0
## 3  1981             5            4      355000             519 1981-1982        0        0        0        0
## 4  1982             6           10      353000              76 1981-1982        0        0        0        1
## 5  1983             7           46      351000             728 1983-1984        0        0        0        2
## 6  1984             8           92      349000            1744 1983-1984        0        0        1        3
## 7  1985             9          201      348000             433 1985-1987        0        0        2        4
## 8  1986            10          329      348000              65 1985-1987        0        1        3        5
## 9  1987            11          456      348000            1820 1985-1987        0        2        4        6
## 10 1988            12          476      347000             111     1988-        0        3        5        7
## 11 1989            13          606      347000            2013     1988-        1        4        6        8
## 12 1990            14          642      348000             192     1988-        2        5        7        9
## 13 1991            15          791      344000              57     1988-        3        6        8       10
## 14 1992            16          645      339000             119     1988-        4        7        9       11

glm2Ls <- glm(formula = hivRateEstimate ~ year + year1981 + year1983 + year1985 + year1988,
                       family  = poisson(link = "log"),
                       data    = aidsData)

newdata <- within(newdata, {

    year1981 <- pmax(year - 1981, 0)
    year1983 <- pmax(year - 1983, 0)
    year1985 <- pmax(year - 1985, 0)
    year1988 <- pmax(year - 1988, 0)
 })


out <- predict(glm2Ls, newdata = newdata)
## out
newdata$predCount <- round(exp(out), 3)
## newdata

ggplot(data = newdata,
       mapping = aes(x = year, y = predCount)) +
    layer(geom = "line") +
    scale_y_log10() +
    theme_bw() +
    theme(legend.key = element_blank())

plot of chunk unnamed-chunk-6



## Qubic Spline
glm2Qs <- glm(formula = hivRateEstimate ~ year + I(year^2) + I(year1981^2) + I(year1983^2) + I(year1985^2) + I(year1988^2),
                       family  = poisson(link = "log"),
                       data    = aidsData)

newdata <- data.frame(year = seq(from = 1979, to = 1992, by = 0.05))
newdata <- within(newdata, {

    year1981 <- pmax(year - 1981, 0)
    year1983 <- pmax(year - 1983, 0)
    year1985 <- pmax(year - 1985, 0)
    year1988 <- pmax(year - 1988, 0)
 })


out <- predict(glm2Qs, newdata = newdata)
newdata$predCount <- round(exp(out), 3)


ggplot(data = newdata,
       mapping = aes(x = year, y = predCount)) +
    layer(geom = "line") +
    scale_y_log10() +
    theme_bw() +
    theme(legend.key = element_blank())

plot of chunk unnamed-chunk-6

mfp example

## Load library
library(mfp)

## Load data and fix error
data(bodyfat)
bodyfat$height[bodyfat$case==42] <- 69.5   # apparent error
bodyfat <- bodyfat[-which(bodyfat$case==39),]  # cp. Royston $amp$ Sauerbrei, 2004

summary(bodyfat)
##       case           brozek          siri         density           age           weight        height    
##  Min.   :  1.0   Min.   : 0.0   Min.   : 0.0   Min.   :0.995   Min.   :22.0   Min.   :118   Min.   :64.0  
##  1st Qu.: 64.5   1st Qu.:12.8   1st Qu.:12.4   1st Qu.:1.042   1st Qu.:35.5   1st Qu.:159   1st Qu.:68.2  
##  Median :127.0   Median :19.0   Median :19.2   Median :1.055   Median :43.0   Median :176   Median :70.0  
##  Mean   :126.8   Mean   :18.9   Mean   :19.1   Mean   :1.056   Mean   :44.9   Mean   :178   Mean   :70.3  
##  3rd Qu.:189.5   3rd Qu.:24.6   3rd Qu.:25.2   3rd Qu.:1.070   3rd Qu.:54.0   3rd Qu.:197   3rd Qu.:72.2  
##  Max.   :252.0   Max.   :45.1   Max.   :47.5   Max.   :1.109   Max.   :81.0   Max.   :263   Max.   :77.8  
##       neck          chest          abdomen           hip            thigh           knee          ankle     
##  Min.   :31.1   Min.   : 79.3   Min.   : 69.4   Min.   : 85.0   Min.   :47.2   Min.   :33.0   Min.   :19.1  
##  1st Qu.:36.4   1st Qu.: 94.3   1st Qu.: 84.5   1st Qu.: 95.5   1st Qu.:56.0   1st Qu.:37.0   1st Qu.:22.0  
##  Median :38.0   Median : 99.6   Median : 90.9   Median : 99.3   Median :59.0   Median :38.5   Median :22.8  
##  Mean   :37.9   Mean   :100.7   Mean   : 92.3   Mean   : 99.7   Mean   :59.3   Mean   :38.5   Mean   :23.1  
##  3rd Qu.:39.4   3rd Qu.:105.3   3rd Qu.: 99.2   3rd Qu.:103.3   3rd Qu.:62.3   3rd Qu.:39.9   3rd Qu.:24.0  
##  Max.   :43.9   Max.   :128.3   Max.   :126.2   Max.   :125.6   Max.   :74.4   Max.   :46.0   Max.   :33.9  
##      biceps        forearm         wrist     
##  Min.   :24.8   Min.   :21.0   Min.   :15.8  
##  1st Qu.:30.2   1st Qu.:27.3   1st Qu.:17.6  
##  Median :32.0   Median :28.7   Median :18.3  
##  Mean   :32.2   Mean   :28.7   Mean   :18.2  
##  3rd Qu.:34.3   3rd Qu.:30.0   3rd Qu.:18.8  
##  Max.   :39.1   Max.   :34.9   Max.   :21.4

## siri Percent body fat using Siri's equation: 495/Density - 450

## Fit
mfpDf4 <- mfp(siri ~ fp(age,    df = 4, select = 0.1) +
                     fp(weight, df = 4, select = 0.1) +
                     fp(height, df = 4, select = 0.1),
              family = gaussian, data = bodyfat)

## Show
summary(mfpDf4)
## 
## Call:
## glm(formula = siri ~ I((weight/100)^-0.5) + I((height/100)^1) + 
##     I((age/100)^1), family = gaussian, data = bodyfat)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -12.824   -3.720   -0.361    3.779   12.852  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            203.13      14.84   13.69  < 2e-16 ***
## I((weight/100)^-0.5)  -123.92       6.90  -17.96  < 2e-16 ***
## I((height/100)^1)     -136.90      15.61   -8.77  3.0e-16 ***
## I((age/100)^1)          12.99       2.75    4.72  3.9e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 27.76)
## 
##     Null deviance: 17320.4  on 250  degrees of freedom
## Residual deviance:  6855.6  on 247  degrees of freedom
## AIC: 1552
## 
## Number of Fisher Scoring iterations: 2

newdata <- data.frame(age = 43.00, weight = seq(from = 118.5, to = 262.8, by = 0.5), height = 70.00)
head(newdata)
##   age weight height
## 1  43  118.5     70
## 2  43  119.0     70
## 3  43  119.5     70
## 4  43  120.0     70
## 5  43  120.5     70
## 6  43  121.0     70

newdata$predPercentFat <- predict(object = mfpDf4, newdata = newdata)

library(ggplot2)
ggplot(data = newdata,
       mapping = aes(x = weight, y = predPercentFat)) +
    layer(geom = "line") +
    layer(data = bodyfat, geom = "point", mapping = aes(y = siri)) +
    theme_bw() +
    theme(legend.key = element_blank())

plot of chunk unnamed-chunk-7