Quick'n'dirty quantile regression starter

Trying to fit an equation to specific quantiles of the data to get a boundary line.

Using the package quantreg

See http://cran.r-project.org/web/packages/quantreg/vignettes/rq.pdf

Manual: http://cran.r-project.org/web/packages/quantreg/quantreg.pdf

A paper on quantile regression from an ecology perspective: http://www.econ.uiuc.edu/~roger/research/rq/QReco.pdf

Analysis of crop yield boundary lines using quantile regression Makowski et al. (2007) Agronomy for Sustainable Development: http://dx.doi.org/10.1051/agro%3A2006029

install.packages("quantreg")

# Quantile regression
df <- read.csv("Gas_Exchange.csv")

# relationship between conductance temperature
# manually defined, wide x-range
# see plot at the bottom
plot(Cond ~ Tleaf, 
     data = df, 
     xlim = c(-10, 50))

# equation to use
# just a simple quadratic approach
# y ~ a * x^2 + b * x  + c

my.equation <- Cond ~ a * Tleaf^2 + b * Tleaf  + c

# fit the equation to the data via "non-linear least squares"
nls.fit <- nls(my.equation,
               data = df,
               start = list(a = 2, b = 3, c = 5))

# look at the result
summary(nls.fit)

## 
## Formula: Cond ~ a * Tleaf^2 + b * Tleaf + c
## 
## Parameters:
##    Estimate Std. Error t value Pr(>|t|)   
## a -0.000899   0.000276   -3.26   0.0011 **
## b  0.039850   0.012547    3.18   0.0015 **
## c  0.060984   0.138156    0.44   0.6590   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.285 on 920 degrees of freedom
## 
## Number of iterations to convergence: 1 
## Achieved convergence tolerance: 2.05e-06


# create a dummy range of that we use to predict leaf temperature from our fitted model
predict_range <- data.frame(Tleaf = seq(-10, 50, length = 250))

# calculate for each x-range value the corresponding y-range
my.line <- within(predict_range, Cond <- predict(nls.fit, newdata = predict_range))

# add the line to the existing graph
# This line represents the "mean" fit, no quantile regression involved
lines(Cond ~ Tleaf, data = my.line, col = "red")


library(quantreg)

## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:base':
## 
##     backsolve


# Non-linear quantile regression
# aiming for the upper 95% quantile
my.rq <- nlrq(my.equation,
           data = df,
           start = list(a = 2, b = 2, c = 5),
           tau = .95)
summary(my.rq)

## 
## Call: nlrq(formula = my.equation, data = df, start = list(a = 2, b = 2, 
##     c = 5), tau = 0.95, control = structure(list(maxiter = 100, 
##     k = 2, InitialStepSize = 1, big = 1e+20, eps = 1e-07, beta = 0.97), .Names = c("maxiter", 
## "k", "InitialStepSize", "big", "eps", "beta")), trace = FALSE)
## 
## tau: [1] 0.95
## 
## Coefficients:
##   Value    Std. Error t value  Pr(>|t|)
## a -0.00319  0.00055   -5.78948  0.00000
## b  0.15240  0.02303    6.61869  0.00000
## c -0.66404  0.21573   -3.07813  0.00214


# calculating the values from the model
my.line95 <- within(predict_range, 
             Cond <- predict(my.rq, 
                            newdata = predict_range))

lines(Cond ~ Tleaf, data = my.line95, col = "blue")

# re-doing the 99% quantile
# nlrq can be supplied with a list of "tau" values, to get multiple quantiles in one go.
# For now I'm doing one by one only...

my.rq.99 <- nlrq(my.equation,
           data = df,
           start = list(a = 2, b = 2, c = 5),
           tau = .99)
summary(my.rq.99)

## 
## Call: nlrq(formula = my.equation, data = df, start = list(a = 2, b = 2, 
##     c = 5), tau = 0.99, control = structure(list(maxiter = 100, 
##     k = 2, InitialStepSize = 1, big = 1e+20, eps = 1e-07, beta = 0.97), .Names = c("maxiter", 
## "k", "InitialStepSize", "big", "eps", "beta")), trace = FALSE)
## 
## tau: [1] 0.99
## 
## Coefficients:
##   Value    Std. Error t value  Pr(>|t|)
## a -0.00371  0.00061   -6.09065  0.00000
## b  0.17750  0.02470    7.18658  0.00000
## c -0.79848  0.23541   -3.39184  0.00072


# the usual calculation of the corresponding line
my.line99 <- within(predict_range, 
             Cond <- predict(my.rq.99, 
                            newdata = predict_range))

lines(Cond ~ Tleaf, data = my.line99, col = "green")

# Add a legend to the graph
legend(0, 1.2, 
       c("Mean", "95%", "99%"),
       lty = c(rep("solid", 3)),
       col = c("red", "blue", "green"),
       cex = 0.8) # make the characters slightly smaller than standard text

plot of chunk unnamed-chunk-1

Another example: Sigmoid

# import the data
df <- read.csv("FC.csv")

plot(rel.gs ~ X.FC,
      data = df,
      xlim = c(0, 100))

my.equation <- rel.gs ~ a/(1+exp(-(X.FC-c)/b))
my.start <- list(a = 0.5, b = 1, c = 50)

nls.fit <- nls(my.equation,
               data = df,
               start = my.start)

summary(nls.fit)

## 
## Formula: rel.gs ~ a/(1 + exp(-(X.FC - c)/b))
## 
## Parameters:
##   Estimate Std. Error t value Pr(>|t|)    
## a   0.8321     0.0851    9.78  < 2e-16 ***
## b  18.7708     3.7912    4.95  1.6e-06 ***
## c  61.0794     6.2147    9.83  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.212 on 197 degrees of freedom
## 
## Number of iterations to convergence: 8 
## Achieved convergence tolerance: 1.82e-06


# create a dummy data frame that holds the predicted data form the model
predict_range <- data.frame(X.FC = seq(0, 100, length = 200))
my.line <- within(predict_range, rel.gs <- predict(nls.fit, newdata = predict_range))

# add the predicted line to the base plot
lines(rel.gs ~ X.FC, data = my.line, col = "blue")

# separate fits for AA and AM
nls.aneura <- nls(my.equation,
               data = df[df$Species == "A. aneura",],
               start = my.start)
summary(nls.aneura)

## 
## Formula: rel.gs ~ a/(1 + exp(-(X.FC - c)/b))
## 
## Parameters:
##   Estimate Std. Error t value Pr(>|t|)    
## a   0.7985     0.0504   15.83  < 2e-16 ***
## b  12.9359     3.2876    3.93  0.00016 ***
## c  49.9436     4.5582   10.96  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.193 on 97 degrees of freedom
## 
## Number of iterations to convergence: 10 
## Achieved convergence tolerance: 5.3e-06


nls.melanoxylon <- nls(my.equation,
               data = df[df$Species == "A. melanoxylon", ], 
               start = my.start,
               algorithm = "port",
               trace = FALSE,
               nls.control(maxiter = 100000, warnOnly = TRUE, 
                           minFactor = 0.000000000009))

## Warning: Convergence failure: function evaluation limit reached without
## convergence (9)


# Does not work well with these data, unfortunately
summary(nls.melanoxylon)

## 
## Formula: rel.gs ~ a/(1 + exp(-(X.FC - c)/b))
## 
## Parameters:
##   Estimate Std. Error t value Pr(>|t|)  
## a    395.3   184135.8    0.00    0.998  
## b     37.0       19.1    1.94    0.055 .
## c    333.4    17403.1    0.02    0.985  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.196 on 97 degrees of freedom
## 
## Algorithm "port", convergence message: function evaluation limit reached without convergence (9)


# quantile regression for the upper 95%
library(quantreg)

my.rq <- nlrq(my.equation,
              data = df,
              start = list(a = 2, b = 2, c = 5),
              tau = .95)
summary(my.rq)

## 
## Call: nlrq(formula = my.equation, data = df, start = list(a = 2, b = 2, 
##     c = 5), tau = 0.95, control = structure(list(maxiter = 100, 
##     k = 2, InitialStepSize = 1, big = 1e+20, eps = 1e-07, beta = 0.97), .Names = c("maxiter", 
## "k", "InitialStepSize", "big", "eps", "beta")), trace = FALSE)
## 
## tau: [1] 0.95
## 
## Coefficients:
##   Value    Std. Error t value  Pr(>|t|)
## a  1.00532  0.01539   65.33274  0.00000
## b 12.71319  4.52469    2.80974  0.00546
## c 30.80014  5.12539    6.00932  0.00000


my.line95 <- within(predict_range, 
                    rel.gs <- predict(my.rq, 
                    newdata = predict_range))

lines(rel.gs ~ X.FC, data = my.line95, col = "green")

plot of chunk unnamed-chunk-2

# separate code for AA and AM 95% quantile regression using nlrq
# deliberately not using split-apply or plyr to show the individual process 
my.rq.95.aa <- nlrq(my.equation,
                 data = df[df$Species == "A. aneura",],
                 start = list(a = 2, b = 2, c = 5),
                 tau = .95)
summary(my.rq.95.aa)

## 
## Call: nlrq(formula = my.equation, data = df[df$Species == "A. aneura", 
##     ], start = list(a = 2, b = 2, c = 5), tau = 0.95, control = structure(list(
##     maxiter = 100, k = 2, InitialStepSize = 1, big = 1e+20, eps = 1e-07, 
##     beta = 0.97), .Names = c("maxiter", "k", "InitialStepSize", 
## "big", "eps", "beta")), trace = FALSE)
## 
## tau: [1] 0.95
## 
## Coefficients:
##   Value    Std. Error t value  Pr(>|t|)
## a  0.99849  0.01304   76.60056  0.00000
## b  9.45893  4.60588    2.05367  0.04270
## c 22.53856  5.97793    3.77030  0.00028


my.rq.95.am <- nlrq(my.equation,
                 data = df[df$Species == "A. melanoxylon",],
                 start = list(a = 2, b = 2, c = 5),
                 tau = .95)
summary(my.rq.95.am)

## 
## Call: nlrq(formula = my.equation, data = df[df$Species == "A. melanoxylon", 
##     ], start = list(a = 2, b = 2, c = 5), tau = 0.95, control = structure(list(
##     maxiter = 100, k = 2, InitialStepSize = 1, big = 1e+20, eps = 1e-07, 
##     beta = 0.97), .Names = c("maxiter", "k", "InitialStepSize", 
## "big", "eps", "beta")), trace = FALSE)
## 
## tau: [1] 0.95
## 
## Coefficients:
##   Value    Std. Error t value  Pr(>|t|)
## a  1.16485  0.23508    4.95510  0.00000
## b 27.76214 15.37038    1.80621  0.07399
## c 49.95251 15.62986    3.19597  0.00188


# get the data to put lines on the grpah
my.line95.aa <- within(predict_range, 
                    rel.gs <- predict(my.rq.95.aa, 
                    newdata = predict_range))
my.line95.aa$Species <- "A. aneura"

my.line95.am <- within(predict_range, 
                    rel.gs <- predict(my.rq.95.am, 
                    newdata = predict_range))
my.line95.am$Species <- "A. melanoxylon"

# combine the two data sets
quantile.reg <- rbind(my.line95.aa, my.line95.am)
quantile.reg$Species <- as.factor(quantile.reg$Species)

# ggplot Graph:
library(ggplot2)
p <- ggplot(df, aes(x = X.FC, y = rel.gs))
     p <- p + geom_point(aes(colour = Species))
     p <- p + geom_line(data = quantile.reg, aes(linetype = Species))
     p <- p + theme_bw()
p

plot of chunk unnamed-chunk-3