Question 2

2(a)

NVDA <- read_csv("C:/Users/david/Downloads/NVDA.csv")
## Rows: 3539485 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (6): EX, SYM_ROOT, TR_SCOND, TR_CORR, TR_SOURCE, TR_RF
## dbl  (4): SIZE, PRICE, TR_SEQNUM, TR_ID
## lgl  (2): SYM_SUFFIX, TR_STOP_IND
## date (1): DATE
## time (1): TIME_M
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# full timestamp: trade date (midnight) plus TIME_M as seconds after midnight
NVDA$timestamp <- as.POSIXct(NVDA$DATE) +
  as.numeric(NVDA$TIME_M)

NVDA_sample <- NVDA |>
  group_by(timestamp) |>   # keep one randomly chosen trade per timestamp
  slice_sample(n = 1) |>
  ungroup()

ggplot(NVDA_sample, aes(TIME_M, PRICE)) + 
  geom_line() + 
  xlab("Time") +
  ylab("Price")

2(b)

To choose the threshold, I set thresh equal to the 0.99 empirical quantile of \(|P_t - \mathrm{runmed}(P_t, 3)|\), so that only roughly the 1% most extreme deviations are flagged as outliers.
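
In symbols, the cleaning rule implemented below replaces a price by its local running median whenever it deviates from that median by more than the threshold:
\[
P_t^{\text{clean}} =
\begin{cases}
\mathrm{runmed}(P_t, 3), & |P_t - \mathrm{runmed}(P_t, 3)| > \text{thresh},\\
P_t, & \text{otherwise.}
\end{cases}
\]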

hf_data_clean <- function(x, k, thresh) {
  rm <- runmed(x, k)            # running median with window width k
  ind <- abs(x - rm) > thresh   # flag prices deviating by more than thresh
  x[ind] <- rm[ind]             # replace flagged prices by the running median
  x
}

run_med  <- runmed(NVDA_sample$PRICE, 3)      # running median, window width 3
abs_diff <- abs(NVDA_sample$PRICE - run_med)  # absolute deviation from it

# threshold: the 0.99 empirical quantile, so roughly 1% of prices are flagged
thresh <- quantile(abs_diff, 0.99)

NVDA_sample <- NVDA_sample |>
  mutate(PRICE_clean = hf_data_clean(PRICE, 3, thresh))

NVDA_sample |>
  dplyr::select(TIME_M, PRICE, PRICE_clean) |>
  pivot_longer(c(PRICE, PRICE_clean)) |>
  ggplot(aes(x = TIME_M, y = value, col = name)) +
  geom_line()

ggplot(NVDA_sample, aes(TIME_M, PRICE_clean)) + 
  geom_line() + 
  xlab("Time") +
  ylab("Price")

2(c)

For Question 2(c), I computed the quadratic variation for subsampling intervals \(d = 1,\dots,15\) using quad_var and plotted qv against \(d\). The estimated quadratic variation is largest for small \(d\) and decreases steadily as \(d\) increases, rather than remaining roughly flat as it would for a noise-free price process. This pattern suggests that the prices are contaminated by market microstructure noise.
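
A standard additive-noise argument (an assumption used here only to interpret the plot) explains this shape: if the observed price is \(P_t = P_t^* + u_t\) with efficient price \(P_t^*\) and i.i.d. noise \(u_t\) of variance \(\sigma_u^2\), then \(\mathrm{E}[(P_{t+d} - P_t)^2] \approx \mathrm{E}[(P_{t+d}^* - P_t^*)^2] + 2\sigma_u^2\). The estimator computed by quad_var,
\[
\widehat{QV}(d) = \frac{1}{d}\sum_{t}(P_{t+d} - P_t)^2,
\]
therefore carries a noise contribution of roughly \(2(n/d)\sigma_u^2\), which shrinks as \(d\) grows and produces the decreasing pattern observed.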

quad_var <- function(x, max_lag){
  d <- 1:max_lag
  qv <- rep(0, max_lag)
  for (i in 1:max_lag) {
    # subsampled quadratic variation at lag i, averaged over the i offset grids
    qv[i] <- sum(diff(x, lag = i)^2 / i)
  }
  tibble(d = d, qv = qv)
}

quad_var(NVDA_sample$PRICE_clean, 15) |>
  ggplot(aes(x = d, y = qv)) +
  geom_point()

Question 3

3(a)

ARCH1_sim <- function(a0, a1, n){
  if (a0 <= 0)
    stop("a0 must be > 0")
  if (a1 < 0 || a1 >= 1)
    stop("0 <= a1 < 1 must be satisfied")

  x       <- numeric(n)
  sigma2  <- numeric(n)
  epsilon <- rnorm(n)

  # initialise at the stationary (unconditional) variance a0 / (1 - a1)
  sigma2[1] <- a0 / (1 - a1)
  x[1] <- sqrt(sigma2[1]) * epsilon[1]

  for (i in 2:n){
    sigma2[i] <- a0 + a1 * x[i - 1]^2    # ARCH(1) conditional variance
    x[i] <- sqrt(sigma2[i]) * epsilon[i]
  }
  x
}

3(b)

For Question 3(b), I used the function ARCH1_sim from part (a) to simulate three ARCH(1) sample paths of length \(n = 10,000\) (with set.seed(1234)).

The three models correspond to the following parameter values: \[ (a_0, a_1) = (0.1, 0.2),\quad (0.1, 0.9),\quad (0.3, 0.9). \]

set.seed(1234)
n <- 10000

x1 <- ARCH1_sim(0.1, 0.2, n)
x2 <- ARCH1_sim(0.1, 0.9, n)
x3 <- ARCH1_sim(0.3, 0.9, n)

par(mfrow=c(3,1))
ts.plot(x1, main = "ARCH(1): a0=0.1, a1=0.2")
ts.plot(x2, main = "ARCH(1): a0=0.1, a1=0.9")
ts.plot(x3, main = "ARCH(1): a0=0.3, a1=0.9")

Question 5
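
For Question 5, I compare the least-squares (LS) estimator of the ARCH(1) parameters with the maximum-likelihood estimator (MLE) over \(R = 1000\) simulated paths of length \(n = 500\) with \((a_0, a_1) = (1, 0.6)\), using mean squared error as the criterion. The LS estimator exploits the autoregression satisfied by the squared process,
\[
\mathrm{E}\!\left[x_t^2 \mid x_{t-1}\right] = a_0 + a_1 x_{t-1}^2,
\]
so \(a_1\) is estimated by the OLS slope from regressing \(x_t^2\) on \(x_{t-1}^2\), and \(a_0\) by the corresponding intercept. The MLE comes from garch() with order = c(0, 1) (assumed here to be tseries::garch, whose ARCH(1) coefficients are named a0 and a1).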

ARCH1_ls <- function(x){
  y <- x[-1]^2                   # x_t^2
  z <- x[1:(length(x) - 1)]^2    # x_{t-1}^2
  ybar <- mean(y)
  zbar <- mean(z)

  # OLS slope and intercept from regressing x_t^2 on x_{t-1}^2
  a1_hat <- sum((y - ybar) * (z - zbar)) / sum((z - zbar)^2)
  a0_hat <- ybar - a1_hat * zbar
  c(a0 = a0_hat, a1 = a1_hat)
}

set.seed(1234)
R  <- 1000
n  <- 500
a0 <- 1
a1 <- 0.6

ls_est  <- matrix(NA, R, 2)
mle_est <- matrix(NA, R, 2)
colnames(ls_est) <- c("a0", "a1")
colnames(mle_est) <- c("a0", "a1")

for (r in 1:R) {
  x <- ARCH1_sim(a0, a1, n)
  ls_est[r, ] <- ARCH1_ls(x)

  # Gaussian (quasi-)MLE of the ARCH(1) model via garch() (tseries)
  fit <- garch(x, order = c(0, 1), trace = FALSE)
  coefficient  <- coef(fit)
  mle_est[r, ] <- coefficient[c("a0", "a1")]
}

# MSE of each estimator, relative to the true (a0, a1)
ls_mse  <- colMeans((ls_est  - matrix(c(a0, a1), R, 2, byrow = TRUE))^2)
mle_mse <- colMeans((mle_est - matrix(c(a0, a1), R, 2, byrow = TRUE))^2)

ls_mse
##         a0         a1 
## 0.51848819 0.03983174
mle_mse
##          a0          a1 
## 0.010840590 0.009222524

From the simulation above, the MLE clearly outperforms the LS estimator for both parameters: its MSE is roughly 0.011 versus 0.518 for \(a_0\) and 0.009 versus 0.040 for \(a_1\).
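
As a complementary check, the Monte Carlo MSE can be split into squared bias and variance to see where the LS estimator loses accuracy. This is a minimal sketch, assuming the ls_est and mle_est matrices from the loop above are still in memory; the helper bias_var() is introduced purely for illustration.

true_par <- c(a0 = 1, a1 = 0.6)

# squared bias and sample variance of each column of Monte Carlo estimates;
# their sum approximates the MSE reported above (up to an R/(R-1) factor)
bias_var <- function(est, truth) {
  bias <- colMeans(est) - truth
  vari <- apply(est, 2, var)
  rbind(bias2 = bias^2, variance = vari, approx_mse = bias^2 + vari)
}

bias_var(ls_est, true_par)
bias_var(mle_est, true_par)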