NVDA <- read_csv("C:/Users/david/Downloads/NVDA.csv")
## Rows: 3539485 Columns: 14
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): EX, SYM_ROOT, TR_SCOND, TR_CORR, TR_SOURCE, TR_RF
## dbl (4): SIZE, PRICE, TR_SEQNUM, TR_ID
## lgl (2): SYM_SUFFIX, TR_STOP_IND
## date (1): DATE
## time (1): TIME_M
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
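As the message suggests, the import can be made quieter and more reproducible by passing the column types explicitly; a sketch, with the types taken from the printout above:
NVDA <- read_csv("C:/Users/david/Downloads/NVDA.csv",
                 col_types = cols(
                   EX = col_character(), SYM_ROOT = col_character(),
                   TR_SCOND = col_character(), TR_CORR = col_character(),
                   TR_SOURCE = col_character(), TR_RF = col_character(),
                   SIZE = col_double(), PRICE = col_double(),
                   TR_SEQNUM = col_double(), TR_ID = col_double(),
                   SYM_SUFFIX = col_logical(), TR_STOP_IND = col_logical(),
                   DATE = col_date(), TIME_M = col_time()
                 ))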
# Build full timestamps: DATE at midnight plus TIME_M, the number of seconds
# after midnight
NVDA$timestamp <- as.POSIXct(NVDA$DATE) + as.numeric(NVDA$TIME_M)
# Keep one randomly chosen trade per timestamp
NVDA_sample <- NVDA |>
  group_by(timestamp) |>
  slice_sample(n = 1) |>
  ungroup()
ggplot(NVDA_sample, aes(TIME_M, PRICE)) +
geom_line() +
xlab("Time") +
ylab("Price")
To choose the threshold, I set thresh equal to the 0.99 empirical quantile of
\(|P_t - \mathrm{runmed}(P_t, 3)|\), so that only the most extreme 1% of
deviations from the running median are flagged as outliers.
hf_data_clean <- function(x, k, thresh) {
  rm <- runmed(x, k)            # running median with window width k
  ind <- abs(x - rm) > thresh   # flag observations deviating by more than thresh
  x[ind] <- rm[ind]             # replace flagged observations by the running median
  x
}
rm <- runmed(NVDA_sample$PRICE, 3)
abs_diff <- abs(NVDA_sample$PRICE - rm)
thresh <- quantile(abs_diff, 0.99)  # 0.99 empirical quantile of the deviations
NVDA_sample <- NVDA_sample |>
mutate(PRICE_clean = hf_data_clean(PRICE, 3, thresh))
NVDA_sample |>
dplyr::select(TIME_M, PRICE, PRICE_clean) |>
pivot_longer(c(PRICE, PRICE_clean)) |>
ggplot(aes(x = TIME_M, y = value, col = name)) +
geom_line()
ggplot(NVDA_sample, aes(TIME_M, PRICE_clean)) +
geom_line() +
xlab("Time") +
ylab("Price")
For Question 2(c), I computed the quadratic variation for subsampling
intervals \(d = 1,\dots,15\) using quad_var and plotted qv against \(d\).
The estimated quadratic variation is highest for small \(d\) and decreases
as \(d\) increases, rather than remaining flat as it would if the observed
prices followed the efficient price alone. This decay at small \(d\) is the
classic signature of microstructure noise.
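To see why, write the observed price as \(P_t = X_t + \varepsilon_t\), where \(X_t\) is the efficient price and \(\varepsilon_t\) is i.i.d. noise with variance \(\sigma_\varepsilon^2\) (a standard modelling assumption, not something estimated here). For the subsampled estimator computed by quad_var,
\[
\mathrm{QV}(d) = \frac{1}{d} \sum_{t=1}^{n-d} \left(P_{t+d} - P_t\right)^2,
\qquad
\mathbb{E}\left[\mathrm{QV}(d)\right] \approx [X, X] + \frac{2(n - d)\,\sigma_\varepsilon^2}{d},
\]
so the noise contribution shrinks roughly like \(1/d\), matching the decreasing pattern in the plot.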
quad_var <- function(x, max_lag) {
  d <- 1:max_lag
  qv <- rep(0, max_lag)
  for (i in 1:max_lag) {
    # Subsampled realized variance: average the lag-i squared increments
    # over the i possible sampling offsets
    qv[i] <- sum(diff(x, lag = i)^2) / i
  }
  tibble(d = d, qv = qv)
}
quad_var(NVDA_sample$PRICE_clean, 15) |>
ggplot(aes(x = d, y = qv)) +
geom_point()
ARCH1_sim <- function(a0, a1, n) {
  if (a0 <= 0)
    stop("a0 must be > 0")
  if (a1 < 0 || a1 >= 1)
    stop("0 <= a1 < 1 must be satisfied")
  x <- numeric(n)
  sigma2 <- numeric(n)
  epsilon <- rnorm(n)
  sigma2[1] <- a0 / (1 - a1)            # initialize at the stationary variance
  x[1] <- sqrt(sigma2[1]) * epsilon[1]
  for (i in 2:n) {
    sigma2[i] <- a0 + a1 * x[i - 1]^2   # ARCH(1) conditional variance recursion
    x[i] <- sqrt(sigma2[i]) * epsilon[i]
  }
  x
}
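For reference, ARCH1_sim implements the standard ARCH(1) recursion, initializing the variance at its stationary value:
\[
x_t = \sigma_t \varepsilon_t, \qquad
\sigma_t^2 = a_0 + a_1 x_{t-1}^2, \qquad
\varepsilon_t \overset{\text{iid}}{\sim} N(0, 1), \qquad
\sigma_1^2 = \frac{a_0}{1 - a_1}.
\]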
For Question 3(b), I used the function ARCH1_sim from part (a) to simulate
three ARCH(1) sample paths of length \(n = 10{,}000\) (with set.seed(1234)).
The three models correspond to the following parameter values: \[ (a_0, a_1) = (0.1, 0.2),\quad (0.1, 0.9),\quad (0.3, 0.9). \]
set.seed(1234)
n <- 10000
x1 <- ARCH1_sim(0.1, 0.2, n)
x2 <- ARCH1_sim(0.1, 0.9, n)
x3 <- ARCH1_sim(0.3, 0.9, n)
par(mfrow=c(3,1))
ts.plot(x1, main = "ARCH(1): a0=0.1, a1=0.2")
ts.plot(x2, main = "ARCH(1): a0=0.1, a1=0.9")
ts.plot(x3, main = "ARCH(1): a0=0.3, a1=0.9")
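The differences across the three panels line up with the stationary variance \(a_0/(1-a_1)\) implied by each parameter pair,
\[
\frac{0.1}{1 - 0.2} = 0.125, \qquad
\frac{0.1}{1 - 0.9} = 1, \qquad
\frac{0.3}{1 - 0.9} = 3,
\]
so the second and third paths fluctuate on a much larger scale, and the larger \(a_1\) produces more pronounced volatility clustering.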
ARCH1_ls <- function(x) {
  y <- x[-1]^2                  # x_t^2
  z <- x[1:(length(x) - 1)]^2   # x_{t-1}^2
  ybar <- mean(y)
  zbar <- mean(z)
  # OLS slope and intercept of x_t^2 on x_{t-1}^2
  a1_hat <- sum((y - ybar) * (z - zbar)) / sum((z - zbar)^2)
  a0_hat <- ybar - a1_hat * zbar
  c(a0 = a0_hat, a1 = a1_hat)
}
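The regression behind ARCH1_ls follows from squaring the ARCH(1) equation: since \(x_t^2 = \sigma_t^2 \varepsilon_t^2\),
\[
x_t^2 = a_0 + a_1 x_{t-1}^2 + v_t, \qquad
v_t = \sigma_t^2 \left(\varepsilon_t^2 - 1\right), \qquad
\mathbb{E}\left[v_t \mid \mathcal{F}_{t-1}\right] = 0,
\]
so \(x_t^2\) follows an AR(1)-type equation with martingale-difference errors, and OLS of \(x_t^2\) on \(x_{t-1}^2\) consistently estimates \((a_0, a_1)\).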
set.seed(1234)
R <- 1000    # Monte Carlo replications
n <- 500     # sample size per replication
a0 <- 1
a1 <- 0.6
ls_est <- matrix(NA, R, 2)
mle_est <- matrix(NA, R, 2)
colnames(ls_est) <- c("a0", "a1")
colnames(mle_est) <- c("a0", "a1")
for (r in 1:R) {
  x <- ARCH1_sim(a0, a1, n)
  ls_est[r, ] <- ARCH1_ls(x)
  # garch() from the tseries package fits ARCH(1) by quasi-maximum likelihood
  fit <- garch(x, order = c(0, 1), trace = FALSE)
  mle_est[r, ] <- coef(fit)[c("a0", "a1")]
}
# MSE of each estimator across the R replications
ls_mse  <- colMeans((ls_est  - matrix(c(a0, a1), R, 2, byrow = TRUE))^2)
mle_mse <- colMeans((mle_est - matrix(c(a0, a1), R, 2, byrow = TRUE))^2)
ls_mse
## a0 a1
## 0.51848819 0.03983174
mle_mse
## a0 a1
## 0.010840590 0.009222524
From the simulation above, the MLE outperforms the LS method for both parameters: its MSE is roughly 48 times smaller for \(a_0\) (0.011 vs. 0.518) and about 4 times smaller for \(a_1\) (0.009 vs. 0.040).
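A natural follow-up (a sketch reusing the ls_est and mle_est matrices from the loop above) is to split each MSE into squared bias and variance, to see whether LS loses mainly through bias or through dispersion:
mse_decomp <- function(est, truth) {
  # MSE = squared bias + variance, computed column by column
  bias2 <- (colMeans(est) - truth)^2
  vars <- colMeans((est - matrix(colMeans(est), nrow(est), 2, byrow = TRUE))^2)
  rbind(bias2 = bias2, variance = vars, mse = bias2 + vars)
}
mse_decomp(ls_est, c(a0 = a0, a1 = a1))
mse_decomp(mle_est, c(a0 = a0, a1 = a1))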