user2 — Sep 23, 2013, 3:56 AM
require("quantreg")
Loading required package: quantreg Loading required package: SparseM
Attaching package: 'SparseM'
The following object is masked from 'package:base':
backsolve
# ============
# Question one
# ------------
noise <- function(n, p, c, sigma) {
rnorm(n,sd=sigma)*(1 + rbinom(n, 1, p)*(c-1))
}
data <- noise(n=10000, p=0.1, c=10, sigma=1)
hist(data, prob=TRUE, density=50, breaks=50)
density(data)
Call:
density.default(x = data)
Data: data (10000 obs.); Bandwidth 'bw' = 0.1611
x y
Min. :-31.85 Min. :0.0000
1st Qu.:-15.36 1st Qu.:0.0002
Median : 1.12 Median :0.0010
Mean : 1.12 Mean :0.0152
3rd Qu.: 17.61 3rd Qu.:0.0026
Max. : 34.09 Max. :0.3523
plot(density(data))
qqnorm(data)
# Indeed it is non-normal noise.
# ============
# Question two
# ------------
dataset <- function(n, beta, p=0.1, c=10, sigma=1) {
x <- runif(n=n, 0, 1)
y <- x*beta + noise(n=n, p=p, c=c, sigma=sigma)
data.frame(x=x, y=y)
}
for(i in 1:5) {
data <- dataset(n=20, beta=1)
plot(data)
}
# We can't see much from such small plots, but it seems like our
# generation process is correct.
# ==============
# Question three
# --------------
leastsquares <- function(n=20, beta=1, c=2) {
b_hat <- rep(NULL, 1000)
sd_hat <- rep(NULL, 1000)
for(i in 1:1000) {
data <- dataset(n=n, beta=beta, c=c)
b_hat[i] <- sum(data$x*data$y)/sum(data$x^2)
#assume e is iid AROUND ZERO: (otherwise use sd)
sd_hat[i] <- sqrt(mean((data$y-data$x*b_hat[i])^2))
}
data.frame(b_hat=b_hat, sd_hat=sd_hat)
}
estimates <- leastsquares(n=20, beta=1, c=2)
hist(estimates$b_hat)
plot(density(estimates$b_hat))
hist(estimates$sd_hat)
plot(density(estimates$sd_hat))
# It looks good after correction that we want e_i to be around 0.
# ==============
# Question four
# --------------
sapply(c(2,5,10), function(c) {
e <- leastsquares(n=20, beta=1, c=c)
data.frame(c = c,
b_hat = mean(e$b_hat),
b_hat_sd = sd(e$b_hat),
sd_hat = mean(e$sd_hat),
sd_hat_sd = sd(e$sd_hat))
})
[,1] [,2] [,3]
c 2 5 10
b_hat 1.03 1.024 1.001
b_hat_sd 0.4578 0.729 1.315
sd_hat 1.096 1.658 2.871
sd_hat_sd 0.2197 0.6936 1.645
# ==============
# Question five
# --------------
quant <- function(n=20, beta=1, c=2) {
b_hat <- rep(NULL, 1000)
sd_hat <- rep(NULL, 1000)
for(i in 1:1000) {
data <- dataset(n=n, beta=beta, c=c)
# again we assume e is iid AROUND ZERO:
# (otherwise change 0 to 1 and use sd)
model <- rq(y ~ x + 0, data=data)
b_hat[i] <- model$coefficients[[1]]
sd_hat[i] <- sqrt(mean((model$residuals)^2))
}
data.frame(b_hat=b_hat, sd_hat=sd_hat)
}
# Now calculate for various c values:
sapply(c(2,5,10), function(c) {
e <- quant(n=20, beta=1, c=c)
data.frame(c = c,
b_hat = mean(e$b_hat),
b_hat_sd = sd(e$b_hat))
})
[,1] [,2] [,3]
c 2 5 10
b_hat 1.018 1.011 0.9688
b_hat_sd 0.5115 0.5335 0.5653
# What we see here is that for the least squares method, the sd of
# the b_hat estimates goes up as we increase c, and it stays fairly
# constant for the quantile method. This is because higher c generates
# linearly higher noise, and that punishes the least squares
# method quadratically.
# ============
# Question six
# ------------
data <- dataset(n=100000, p=0, beta=1, sigma=1)
model <- rq(y ~ x + 0, data=data)
b_hat <- model$coefficients[[1]]
s <- mean(abs(data$y - b_hat*data$x))
s
[1] 0.7991
# It seems biased indeed; for these parameters it's ~80% of true SD
# (after trying it multiple times)
# ==============
# Question seven
# --------------
# Type I error, so rejection when null hypothesis is true, so B=0
# This is for the case with outliers:
rejections = rep(NULL, 10000)
for(i in 1:10000) {
data <- dataset(n=20, p=0.1, beta=0, sigma=1)
model <- rq(y ~ x + 0, data=data)
b_hat <- model$coefficients[[1]]
s <- mean(abs(data$y - b_hat*data$x))
w <- data$x/sum(data$x^2)
rejections[i] <- b_hat > 2*s*sqrt(sum(w^2))
}
mean(rejections) # with outliers
[1] 0.042
# We can repeat this for the test without outliers, if that's meant:
# (by setting p to 0)
rejections = rep(NULL, 10000)
for(i in 1:10000) {
data <- dataset(n=20, p=0, beta=0, sigma=1)
model <- rq(y ~ x + 0, data=data)
b_hat <- model$coefficients[[1]]
s <- mean(abs(data$y - b_hat*data$x))
w <- data$x/sum(data$x^2)
rejections[i] <- b_hat > 2*s*sqrt(sum(w^2))
}
mean(rejections) # without outliers
[1] 0.1082
# ==============
# Question eight
# --------------
# Here I will assume that it is for the data with outliers;
# to do it without, change the p=0.1 to p=0.
type_II <- function(beta) {
rejections = rep(NULL, 1000)
for(i in 1:1000) {
data <- dataset(n=20, p=0.1, beta=beta, sigma=1)
model <- rq(y ~ x + 0, data=data)
b_hat <- model$coefficients[[1]]
s <- mean(abs(data$y - b_hat*data$x))
w <- data$x/sum(data$x^2)
rejections[i] <- b_hat > 2*s*sqrt(sum(w^2))
}
1-rejections
}
sapply(seq(0.2, 1, 0.1), function(beta) {
data.frame(beta = beta,
power = mean(1-type_II(beta=beta)))
})
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
beta 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9 1
power 0.075 0.113 0.133 0.187 0.206 0.272 0.341 0.41 0.461