StatComp2.r

user2 — Sep 23, 2013, 3:56 AM

require("quantreg")
Loading required package: quantreg
Loading required package: SparseM

Attaching package: 'SparseM'

The following object is masked from 'package:base':

backsolve

# ============
# Question one
# ------------
noise <- function(n, p, c, sigma) {
  rnorm(n,sd=sigma)*(1 + rbinom(n, 1, p)*(c-1))
}
data <- noise(n=10000, p=0.1, c=10, sigma=1)
hist(data, prob=TRUE, density=50, breaks=50)

density(data)

Call:
    density.default(x = data)

Data: data (10000 obs.);    Bandwidth 'bw' = 0.1611

       x                y         
 Min.   :-31.85   Min.   :0.0000  
 1st Qu.:-15.36   1st Qu.:0.0002  
 Median :  1.12   Median :0.0010  
 Mean   :  1.12   Mean   :0.0152  
 3rd Qu.: 17.61   3rd Qu.:0.0026  
 Max.   : 34.09   Max.   :0.3523  
plot(density(data))

qqnorm(data)

# Indeed, the noise is non-normal: the QQ-plot shows tails far heavier
# than a normal distribution would produce.
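
# As an extra quantitative check (a sketch, not part of the original
# answer): add a reference line to the QQ-plot and run a formal test.
# shapiro.test() accepts at most 5000 points, so we test a subsample;
# with this heavy-tailed mixture, expect a very small p-value.
qqnorm(data); qqline(data)        # heavy tails bend away from the line
shapiro.test(sample(data, 5000))  # formal rejection of normality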

# ============
# Question two
# ------------
dataset <- function(n, beta, p=0.1, c=10, sigma=1) {
  x <- runif(n=n, 0, 1)
  y <- x*beta + noise(n=n, p=p, c=c, sigma=sigma)
  data.frame(x=x, y=y)
}
for(i in 1:5) {
  data <- dataset(n=20, beta=1)
  plot(data)
}

# With only 20 points per sample we can't see much, but the scatter is
# consistent with y = beta*x plus noise, so the generation process
# looks correct.
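
# A quick visual aid (a sketch; the abline() overlay is an addition,
# not part of the original answer): drawing the true line y = beta*x on
# each sample makes it easier to see that deviations are just noise.
for(i in 1:5) {
  data <- dataset(n=20, beta=1)
  plot(data)
  abline(a=0, b=1, col="red")  # true regression line for beta = 1
}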

# ==============
# Question three
# --------------
leastsquares <- function(n=20, beta=1, c=2) {
  b_hat <- numeric(1000)   # pre-allocate (rep(NULL, 1000) just returns NULL)
  sd_hat <- numeric(1000)
  for(i in 1:1000) {
    data <- dataset(n=n, beta=beta, c=c)
    b_hat[i] <- sum(data$x*data$y)/sum(data$x^2)
    # assume the errors e_i are iid with mean zero, so the root mean
    # square of the residuals estimates sigma (otherwise use sd())
    sd_hat[i] <- sqrt(mean((data$y-data$x*b_hat[i])^2))
  }
  data.frame(b_hat=b_hat, sd_hat=sd_hat)
}
estimates <- leastsquares(n=20, beta=1, c=2)
hist(estimates$b_hat)

plot(density(estimates$b_hat))

hist(estimates$sd_hat)

plot(density(estimates$sd_hat))

# The estimates look reasonable once we use the fact that the errors
# e_i are centered at zero.
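
# As a numeric sanity check (a sketch): the mean of b_hat should be
# near the true beta = 1, and the variance of the mixture noise is
# sigma^2*((1-p) + p*c^2), so with p=0.1, c=2, sigma=1 the true sd is
# sqrt(1.3), about 1.14.
mean(estimates$b_hat)        # expect roughly 1
mean(estimates$sd_hat)       # expect roughly sqrt(0.9 + 0.1*2^2) = 1.14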


# ==============
# Question four
# --------------
sapply(c(2,5,10), function(c) {
  e <- leastsquares(n=20, beta=1, c=c)
  data.frame(c = c, 
             b_hat = mean(e$b_hat), 
             b_hat_sd = sd(e$b_hat),
             sd_hat = mean(e$sd_hat),
             sd_hat_sd = sd(e$sd_hat))
})
          [,1]   [,2]   [,3] 
c         2      5      10   
b_hat     1.03   1.024  1.001
b_hat_sd  0.4578 0.729  1.315
sd_hat    1.096  1.658  2.871
sd_hat_sd 0.2197 0.6936 1.645

# ==============
# Question five
# --------------
quant <- function(n=20, beta=1, c=2) {
  b_hat <- numeric(1000)   # pre-allocate, as above
  sd_hat <- numeric(1000)
  for(i in 1:1000) {
    data <- dataset(n=n, beta=beta, c=c)
    # again we assume the errors are iid with mean zero
    # (otherwise fit an intercept, y ~ x + 1, and use sd())
    model <- rq(y ~ x + 0, data=data)
    b_hat[i] <- model$coefficients[[1]]
    sd_hat[i] <- sqrt(mean((model$residuals)^2))
  }
  data.frame(b_hat=b_hat, sd_hat=sd_hat)
}

# Now calculate for various c values:
sapply(c(2,5,10), function(c) {
  e <- quant(n=20, beta=1, c=c)
  data.frame(c = c,
             b_hat = mean(e$b_hat),
             b_hat_sd = sd(e$b_hat))
})
         [,1]   [,2]   [,3]  
c        2      5      10    
b_hat    1.018  1.011  0.9688
b_hat_sd 0.5115 0.5335 0.5653


# For least squares, the sd of the b_hat estimates grows as we increase
# c, while for the quantile (median) regression it stays fairly
# constant. Larger c makes the outliers linearly larger, and the
# squared-error loss weights those deviations quadratically, so least
# squares is dragged toward the outliers; the absolute-error loss of
# median regression is far less sensitive to them.
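
# To see the mechanism on a single contaminated sample (a sketch
# comparing lm() with rq(); the seed is only for reproducibility):
set.seed(42)
d <- dataset(n=20, beta=1, c=10)
coef(lm(y ~ x + 0, data=d))  # least squares: sensitive to the outliers
coef(rq(y ~ x + 0, data=d))  # median regression: robust to them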

# ============
# Question six
# ------------
data <- dataset(n=100000, p=0, beta=1, sigma=1)
model <- rq(y ~ x + 0, data=data)
b_hat <- model$coefficients[[1]]
s <- mean(abs(data$y - b_hat*data$x))
s
[1] 0.7991
# It is indeed biased: across repeated runs, s comes out at roughly 80%
# of the true sigma for these parameters.
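
# The ~80% has a closed form: for p = 0 the errors are N(0, sigma^2),
# and the mean absolute value of a N(0, sigma^2) variable is
# sigma*sqrt(2/pi), so s estimates sqrt(2/pi)*sigma rather than sigma.
sqrt(2/pi)  # 0.7979, matching the simulated ratio s/sigma above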

# ==============
# Question seven
# --------------
# Type I error: rejecting when the null hypothesis is true, i.e. beta = 0.
# First, the case with outliers (p = 0.1):
rejections <- logical(10000)
for(i in 1:10000) {
  data <- dataset(n=20, p=0.1, beta=0, sigma=1)
  model <- rq(y ~ x + 0, data=data)
  b_hat <- model$coefficients[[1]]
  s <- mean(abs(data$y - b_hat*data$x))
  w <- data$x/sum(data$x^2)
  rejections[i] <- b_hat > 2*s*sqrt(sum(w^2))
}
mean(rejections) # with outliers
[1] 0.042


# We can repeat this for the test without outliers, in case that is
# what is meant, by setting p to 0:
rejections <- logical(10000)
for(i in 1:10000) {
  data <- dataset(n=20, p=0, beta=0, sigma=1)
  model <- rq(y ~ x + 0, data=data)
  b_hat <- model$coefficients[[1]]
  s <- mean(abs(data$y - b_hat*data$x))
  w <- data$x/sum(data$x^2)
  rejections[i] <- b_hat > 2*s*sqrt(sum(w^2))
}
mean(rejections) # without outliers
[1] 0.1082
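
# For reference (a sketch): a one-sided cutoff at 2 standard errors has
# nominal level 1 - pnorm(2), about 0.023, under normal errors. Both
# empirical rates sit above this, plausibly because s underestimates
# sigma (as seen in question six), which shrinks the rejection cutoff.
1 - pnorm(2)  # nominal one-sided level, ~0.0228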

# ==============
# Question eight
# --------------
# Here I assume the question refers to the data with outliers; to run
# it without them, change p=0.1 to p=0.

type_II <- function(beta) {
  rejections <- logical(1000)
  for(i in 1:1000) {
    data <- dataset(n=20, p=0.1, beta=beta, sigma=1)
    model <- rq(y ~ x + 0, data=data)
    b_hat <- model$coefficients[[1]]
    s <- mean(abs(data$y - b_hat*data$x))
    w <- data$x/sum(data$x^2)
    rejections[i] <- b_hat > 2*s*sqrt(sum(w^2))
  }
  # indicator of a type II error: 1 when we fail to reject a false null
  1-rejections
}
sapply(seq(0.2, 1, 0.1), function(beta) {
  data.frame(beta = beta, 
             power = mean(1-type_II(beta=beta)))
})
      [,1]  [,2]  [,3]  [,4]  [,5]  [,6]  [,7]  [,8] [,9] 
beta  0.2   0.3   0.4   0.5   0.6   0.7   0.8   0.9  1    
power 0.075 0.113 0.133 0.187 0.206 0.272 0.341 0.41 0.461
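
# A quick visualization of the power curve (a sketch):
betas <- seq(0.2, 1, 0.1)
powers <- sapply(betas, function(beta) mean(1 - type_II(beta=beta)))
plot(betas, powers, type="b", xlab="beta", ylab="simulated power")
# As expected, power increases with beta: larger true slopes are easier
# to distinguish from the beta = 0 null.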