분위회귀분석

install.packages(“quantreg”)

install.packages(“reshape2”) install.packages(“ggplot2”)

set.seed(2918)

x <- rnorm(10000, 10, 1)  ## x생성

##error들을 생성한다. 

err  <- rnorm(10000, 0, 1)
err1 <- rlnorm(10000, 0, 0.25)
err2 <- rlnorm(10000, 0, 0.50)
err3 <- rlnorm(10000, 0, 1.25)

y1 <- 1 + 2*x + err
y2 <- 1 + 2*x + err1
y3 <- 1 + 2*x + err2
y4 <- 1 + 2*x + err3
y5 <- 1 + 2*x - err3
y6 <- 1 + 2*x + (1 + x)*err

qr.ch2 <- data.frame(x, y1, y2, y3, y4, y5, y6)

qr.ch2.1 <- data.frame()

for (i in 1:6)
{
  model <- rep(i, length(x))
  y.temp <- paste('y',i, sep='')
  y.temp2 <- get(y.temp)
  temp <- data.frame(x, y.temp2, model)
  qr.ch2.1 <- rbind(qr.ch2.1,temp)
}

names(qr.ch2.1) <- c("x", "y", "model")

library("ggplot2")

ggplot(qr.ch2.1, aes(x=x, y=y)) + geom_point() + facet_wrap( ~ model, ncol=3, scales="free")

정규분포 오차의 경우

library("quantreg")

## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:base':
## 
##     backsolve

model1 <- rq(y1 ~ x, tau = c(.05, .1, .25, .5, .75, .9, .95))

graph1 <- data.frame(predict(model1))

head(graph1)

##         X1       X2       X3       X4       X5       X6       X7
## 1 22.40155 22.78253 23.41086 24.04268 24.70200 25.30857 25.68997
## 2 22.92458 23.30621 23.93755 24.56121 25.22064 25.82329 26.20934
## 3 23.02994 23.41170 24.04365 24.66566 25.32512 25.92698 26.31396
## 4 20.23985 20.61815 21.23407 21.89963 22.55846 23.18121 23.54345
## 5 22.64006 23.02134 23.65105 24.27914 24.93851 25.54329 25.92681
## 6 20.35999 20.73844 21.35506 22.01874 22.67760 23.29945 23.66276

library(reshape2)

graph1.1 <- melt(graph1, variable.name = "model")

## No id variables; using all as measure variables

graph1.1$x <- rep(x,7)

head(graph1.1)

##   model    value        x
## 1    X1 22.40155 11.54046
## 2    X1 22.92458 11.80408
## 3    X1 23.02994 11.85718
## 4    X1 20.23985 10.45096
## 5    X1 22.64006 11.66068
## 6    X1 20.35999 10.51152

tail(graph1.1)

##       model    value         x
## 69995    X7 22.29931  9.819475
## 69996    X7 20.22136  8.764778
## 69997    X7 20.09480  8.700538
## 69998    X7 25.42637 11.406671
## 69999    X7 22.50585  9.924310
## 70000    X7 23.28016 10.317322

temp3 <- data.frame(rep("data",10000), y1, x)

names(graph1.1)

## [1] "model" "value" "x"

names(temp3) <- c("model", "value", "x")

graph1.1 <- rbind(graph1.1, temp3)

names(temp3)

## [1] "model" "value" "x"

ggplot(graph1.1, aes(x=x, y=value, color=model)) + geom_point(alpha=0.25)

대칭이지만 정규분포가 아닌 경우

model6 <- rq(y6 ~ x, tau = c(.05, .1, .25, .5, .75, .9, .95))

graph6 <- data.frame(predict(model6))

head(graph6)

##         X1       X2       X3       X4       X5       X6       X7
## 1 3.014280 7.869310 15.68607 23.64611 31.91730 39.53770 44.34424
## 2 3.053343 8.028839 16.03222 24.07913 32.52544 40.27553 45.22760
## 3 3.061211 8.060975 16.10194 24.16636 32.64794 40.42416 45.40555
## 4 2.852836 7.209982 14.25546 21.85647 29.40388 36.48826 40.69330
## 5 3.032093 7.942059 15.84392 23.84358 32.19463 39.87417 44.74707
## 6 2.861809 7.246627 14.33497 21.95593 29.54357 36.65774 40.89622

graph6.1 <- melt(graph6, variable.name = "model")

## No id variables; using all as measure variables

graph6.1$x <- rep(x,7)

head(graph6.1)

##   model    value        x
## 1    X1 3.014280 11.54046
## 2    X1 3.053343 11.80408
## 3    X1 3.061211 11.85718
## 4    X1 2.852836 10.45096
## 5    X1 3.032093 11.66068
## 6    X1 2.861809 10.51152

tail(graph6.1)

##       model    value         x
## 69995    X7 38.57717  9.819475
## 69996    X7 35.04287  8.764778
## 69997    X7 34.82760  8.700538
## 69998    X7 43.89589 11.406671
## 69999    X7 38.92848  9.924310
## 70000    X7 40.24547 10.317322

temp3 <- data.frame(rep("data",10000), y6, x)

names(graph6.1)

## [1] "model" "value" "x"

names(temp3) <- c("model", "value", "x")

graph6.1 <- rbind(graph6.1, temp3)

head(graph6.1)

##   model    value        x
## 1    X1 3.014280 11.54046
## 2    X1 3.053343 11.80408
## 3    X1 3.061211 11.85718
## 4    X1 2.852836 10.45096
## 5    X1 3.032093 11.66068
## 6    X1 2.861809 10.51152

tail(graph6.1)

##       model    value         x
## 79995  data 22.63648  9.819475
## 79996  data 36.66026  8.764778
## 79997  data 27.96781  8.700538
## 79998  data 36.29047 11.406671
## 79999  data 22.20990  9.924310
## 80000  data  5.60435 10.317322

ggplot(graph6.1, aes(x=x, y=value, color=model)) + geom_point(alpha=0.25)

오차의 skew가 심한 경우 1

model4 <- rq(y4 ~ x, tau = c(.05, .1, .25, .5, .75, .9, .95))

graph4 <- data.frame(predict(model4))
graph4.1 <- melt(graph4, variable.name = "model")

## No id variables; using all as measure variables

graph4.1$x <- rep(x,7)

temp3 <- data.frame(rep("data",10000), y4, x)

names(temp3) <- c("model", "value", "x")

graph4.1 <- rbind(graph4.1, temp3)

ggplot(graph4.1, aes(x=x, y=value, color=model)) + geom_point(alpha=0.25) + ylim(0, 75)

## Warning: Removed 7 rows containing missing values (geom_point).

오차의 skew가 심한 경우 2

model5 <- rq(y5 ~ x, tau = c(.05, .1, .25, .5, .75, .9, .95))

graph5 <- data.frame(predict(model5))
graph5.1 <- melt(graph5, variable.name = "model")

## No id variables; using all as measure variables

graph5.1$x <- rep(x,7)

temp3 <- data.frame(rep("data",10000), y5, x)

names(temp3) <- c("model", "value", "x")

graph5.1 <- rbind(graph5.1, temp3)

ggplot(graph5.1, aes(x=x, y=value, color=model)) + geom_point(alpha=0.25) + ylim(-25, 30)

## Warning: Removed 11 rows containing missing values (geom_point).