orats clean - pull raw data

# raw data 
orats_core <- readRDS(file = "../data/orats_core.rds")

# sample
head(orats_core, 3)

orats clean - selecting variables, renaming & type formatting

# formatting the column names to lower-case
# selecting relevant data
# formatting date to be in date format
renamed_orats_core <- orats_core %>%
  rename_all(~tolower(.)) %>%
  select(ticker, 
         "date" = tradedate, 
         "price" = pxatmiv, 
         "sector" = sectorname, 
         avgoptvolu20d, atmivm1, dtexm1, atmivm2, dtexm2, atmivm3, dtexm3, atmivm4, dtexm4) %>%
  mutate(date = as.Date(date))

# sample
head(renamed_orats_core, 3)

orats clean - liquidity filters, removing useless data, number formatting, relabeling categorical values

# checking for any bad data in the numbers
# ie. there are tickers with price & volatility values at 0
renamed_orats_core %>%
  summary()
    ticker               date                price             sector          avgoptvolu20d        atmivm1            dtexm1         atmivm2            dtexm2      
 Length:5875        Min.   :2022-10-03   Min.   :    0.00   Length:5875        Min.   :      0   Min.   :   0.00   Min.   : 0.00   Min.   :   0.00   Min.   :  0.00  
 Class :character   1st Qu.:2022-10-03   1st Qu.:    7.10   Class :character   1st Qu.:     68   1st Qu.:  37.87   1st Qu.:19.00   1st Qu.:  36.30   1st Qu.: 47.00  
 Mode  :character   Median :2022-10-03   Median :   22.24   Mode  :character   Median :    276   Median :  52.94   Median :19.00   Median :  51.73   Median : 47.00  
                    Mean   :2022-10-03   Mean   :   47.02                      Mean   :   8086   Mean   :  63.22   Mean   :18.86   Mean   :  61.37   Mean   : 46.64  
                    3rd Qu.:2022-10-03   3rd Qu.:   49.92                      3rd Qu.:   1354   3rd Qu.:  77.36   3rd Qu.:19.00   3rd Qu.:  75.43   3rd Qu.: 47.00  
                    Max.   :2022-10-03   Max.   :11254.70                      Max.   :8375659   Max.   :1000.00   Max.   :75.00   Max.   :1000.00   Max.   :166.00  
    atmivm3            dtexm3         atmivm4            dtexm4     
 Min.   :   0.00   Min.   :  0.0   Min.   :   0.00   Min.   :  0.0  
 1st Qu.:  32.63   1st Qu.: 75.0   1st Qu.:  31.14   1st Qu.:138.0  
 Median :  46.92   Median :110.0   Median :  45.24   Median :166.0  
 Mean   :  56.45   Mean   : 97.9   Mean   :  54.57   Mean   :172.2  
 3rd Qu.:  70.68   3rd Qu.:110.0   3rd Qu.:  68.13   3rd Qu.:201.0  
 Max.   :1000.00   Max.   :229.0   Max.   :1000.00   Max.   :257.0  
# let's see if they rename after filtering for liquidity
# they do, going to take a closer look at these tickers before filtering out values = 0
renamed_orats_core %>%
  filter(price >= 15 & avgoptvolu20d >= 2000) %>%
  summary()
    ticker               date                price             sector          avgoptvolu20d        atmivm1           dtexm1         atmivm2           dtexm2     
 Length:845         Min.   :2022-10-03   Min.   :   15.00   Length:845         Min.   :   2005   Min.   :  0.00   Min.   : 0.00   Min.   :  0.00   Min.   : 0.00  
 Class :character   1st Qu.:2022-10-03   1st Qu.:   29.99   Class :character   1st Qu.:   3436   1st Qu.: 33.68   1st Qu.:19.00   1st Qu.: 34.43   1st Qu.:47.00  
 Mode  :character   Median :2022-10-03   Median :   57.20   Mode  :character   Median :   6393   Median : 45.45   Median :19.00   Median : 46.16   Median :47.00  
                    Mean   :2022-10-03   Mean   :  112.45                      Mean   :  47547   Mean   : 50.46   Mean   :18.98   Mean   : 51.40   Mean   :46.94  
                    3rd Qu.:2022-10-03   3rd Qu.:  112.80                      3rd Qu.:  18987   3rd Qu.: 62.89   3rd Qu.:19.00   3rd Qu.: 64.12   3rd Qu.:47.00  
                    Max.   :2022-10-03   Max.   :11254.70                      Max.   :8375659   Max.   :309.60   Max.   :19.00   Max.   :232.52   Max.   :47.00  
    atmivm3           dtexm3          atmivm4           dtexm4     
 Min.   :  0.00   Min.   :  0.00   Min.   :  0.00   Min.   :  0.0  
 1st Qu.: 32.70   1st Qu.: 75.00   1st Qu.: 32.16   1st Qu.:110.0  
 Median : 43.72   Median : 75.00   Median : 42.79   Median :110.0  
 Mean   : 49.46   Mean   : 88.07   Mean   : 48.17   Mean   :135.8  
 3rd Qu.: 61.39   3rd Qu.:110.00   3rd Qu.: 59.99   3rd Qu.:142.0  
 Max.   :287.18   Max.   :138.00   Max.   :206.12   Max.   :229.0  
# seems it's just one occurrence $VIX - going to remove it later anyhow
renamed_orats_core %>%
  filter(price >= 15 & avgoptvolu20d >= 2000, 
         if_any(starts_with("atmivm"), function(x) x == 0))

# going to see what we have for sector data
# seems we have 35 categories, there is one (the most frequent) category that has no value for the sector name
renamed_orats_core %>%
  filter(price >= 15 & avgoptvolu20d >= 2000) %>%
  count(sector) %>%
  arrange(desc(n))

# upon closer inspecting, seems to be etfs/indicies - going to relabel this category to be "None"
renamed_orats_core %>%
  filter(price >= 15 & avgoptvolu20d >= 2000, sector == "")

# relabeling sector = "" to "None
# filtering out atmivm{x} = 0
# rounding numeric values to the 2nd digit - since the implied vol numbers seem to be x100
clean_orats_core <- renamed_orats_core %>%
  filter(price >= 15 & avgoptvolu20d >= 2000,
         if_any(starts_with("atmivm"), function(x) x > 0)) %>%
  mutate(sector = ifelse(sector == "", "None", sector),
         across(where(is.numeric), function(x) round(x, 2))) %>%
  arrange(ticker)

head(clean_orats_core, 3)

orats clean - closer inspection on different dtes per same monthly contract

# noticed one last thing, the dte for the front monthlies seems to differ
# going to take a closer look at this, since i want the dte's to be the same for x contract for relative value trading
# seems 
clean_orats_core %>%
  select(ticker, starts_with("dtexm")) %>%
  pivot_longer(dtexm1:dtexm4, names_to = "monthly_contract", values_to = "dte") %>%
  group_by(monthly_contract, dte) %>%
  count() %>%
  arrange(desc(n))

# visualization
clean_orats_core %>%
  select(ticker, starts_with("dtexm")) %>%
  pivot_longer(dtexm1:dtexm4, names_to = "monthly_contract", values_to = "dte") %>%
  ggplot(aes(monthly_contract, dte, fill = monthly_contract)) +
  geom_boxplot() +
  labs(title = "dte at dtexm{x} monthly contract",
       subtitle = "i would want the boxplots to display one value (ie. dtexm1 & dtemx2 = straight line)",
       x = "monthly contract", y = "days to expiration",
       caption = "seems the 3rd & 4th monthly contracts has a range of different dtes")

orats clean - saving data

saveRDS(clean_orats_core, "../data/clean_orats_core.rds")

price clean - helper function (yahoo api query)

# uses yahoo api - parameters are ticker, first_date, and last_date
# i've set it up to only return the date & adjusted close price since we'll be doing correlation analysis

# yahoo api rate seems to be 2k/hour, ~40k/day. since we are only pulling ~800 tickers, not going to set any sleep conditions
price_api <- function(ticker, first_date, last_date){
  
  request <- GET(sprintf(
    "https://query1.finance.yahoo.com/v7/finance/download/%s?period1=%s&period2=%s&interval=1d&events=history&includeAdjustedClose=true", 
    ticker, first_date, last_date))
  
  if(status_code(request) == 200){
    contents <- content(
      x = request,
      as = "parsed",
      type = "text/csv",
      encoding = "UTF-8",
      show_col_types = FALSE,
      col_select = c("date" = "Date", "adj_close" = "Adj Close"))
    
    if(nrow(contents) > 0){
      contents$ticker <- ticker
      return(contents[,c("ticker", "date", "adj_close")])
      
    } else {
      message(sprintf("%s returned no data.", ticker))
      return()
    }
  } else {
    message(sprintf("%s returned status code: %s. No data pulled.", ticker, status_code(request)))
    return()
  }
}

price clean - query data

# pulling last date from our orats option data, adding +1 because yahoo api pulls between dates
orats_date <- as.Date(clean_orats_core$date[1]) + 1

# saving parameter variables for price api function
tickers <- clean_orats_core$ticker
first_date <- as.numeric(as.POSIXct(as.character(orats_date - 365)))
last_date <- as.numeric(as.POSIXct(orats_date))

# foreach loop ran with parallel processing 
price_data <- foreach(var = 1:length(tickers), .combine = "bind_rows", .packages = "httr", .errorhandling = "pass") %dopar% {
  price_api(
    ticker = tickers[var],
    first_date = first_date,
    last_date = last_date)
}

head(price_data, 3)

price clean - checking how much tickers we pulled

# seems we only pulled 419/844 of our tickers
price_data %>%
  count(ticker) %>%
  arrange(n)

# looking at our leftover tickers, there are some with underscores in the name - yahoo doesn't recognize that
# going to pre-filter out those tickers (~35 occurrences)
# yahoo also doesn't give historical data for indices & require there be a prefix "^DJX" to the name - these won't be collected
leftover_tickers <- clean_orats_core %>%
  filter(!ticker %in% unique(price_data$ticker)) %>%
  filter(!str_detect(ticker, "_")) %>%
  pull(ticker)

leftover_tickers
[1] "DJX" "SPX" "XAU" "XSP"
# going to combine our price data tables and do some final cleaning
# the numbers doesn't seem bad (no 0's, etc)
clean_price_data <- price_data %>%
  arrange(ticker, date) %>%
  # summary()
  mutate(adj_close = round(adj_close, 2))

# sample
head(clean_price_data, 3)

price clean - save cleaned price data

---
title: "relative value exercise - data clean"
output: html_notebook
---

### orats clean - pull raw data
- 'raw' data is from orats/cores api

```{r}
# raw data 
orats_core <- readRDS(file = "../data/orats_core.rds")

# sample
head(orats_core, 3)
```

### orats clean - selecting variables, renaming & type formatting
- selected values are ticker, date, price, sector, avgoptvolu20d &
- atmivm{x} = these are the at-the-money implied volatilities for {x} monthly expiration
  - ie. atmivm1 shows the atm implied vol for the front monthly
- dtexm{x} = these are the corresponding dte's of the atmivm{x} values
  - ie. dtexm1 shows the dtes for the front monthly
- **only need today's option contract data for this exercise, so pretty flexible to use other option sources like polygon or a brokerage etc**

```{r}
# formatting the column names to lower-case
# selecting relevant data
# formatting date to be in date format
renamed_orats_core <- orats_core %>%
  rename_all(~tolower(.)) %>%
  select(ticker, 
         "date" = tradedate, 
         "price" = pxatmiv, 
         "sector" = sectorname, 
         avgoptvolu20d, atmivm1, dtexm1, atmivm2, dtexm2, atmivm3, dtexm3, atmivm4, dtexm4) %>%
  mutate(date = as.Date(date))

# sample
head(renamed_orats_core, 3)
```

### orats clean - liquidity filters, removing useless data, number formatting, relabeling categorical values
- our predefined liquidity filter for this exercise is going to be:
  - underlying price above $15
  - average option volume 20days above 2000

```{r fig.width=12}
# checking for any bad data in the numbers
# ie. there are tickers with price & volatility values at 0
renamed_orats_core %>%
  summary()

# let's see if they rename after filtering for liquidity
# they do, going to take a closer look at these tickers before filtering out values = 0
renamed_orats_core %>%
  filter(price >= 15 & avgoptvolu20d >= 2000) %>%
  summary()

# seems it's just one occurrence $VIX - going to remove it later anyhow
renamed_orats_core %>%
  filter(price >= 15 & avgoptvolu20d >= 2000, 
         if_any(starts_with("atmivm"), function(x) x == 0))

# going to see what we have for sector data
# seems we have 35 categories, there is one (the most frequent) category that has no value for the sector name
renamed_orats_core %>%
  filter(price >= 15 & avgoptvolu20d >= 2000) %>%
  count(sector) %>%
  arrange(desc(n))

# upon closer inspecting, seems to be etfs/indicies - going to relabel this category to be "None"
renamed_orats_core %>%
  filter(price >= 15 & avgoptvolu20d >= 2000, sector == "")

# relabeling sector = "" to "None
# filtering out atmivm{x} = 0
# rounding numeric values to the 2nd digit - since the implied vol numbers seem to be x100
clean_orats_core <- renamed_orats_core %>%
  filter(price >= 15 & avgoptvolu20d >= 2000,
         if_any(starts_with("atmivm"), function(x) x > 0)) %>%
  mutate(sector = ifelse(sector == "", "None", sector),
         across(where(is.numeric), function(x) round(x, 2))) %>%
  arrange(ticker)

head(clean_orats_core, 3)
```

### orats clean - closer inspection on different dtes per same monthly contract
- dte seems to range at farther data contracts 
- dont need to do anything about it now, but should be noted when comparing between tickers

```{r}
# noticed one last thing, the dte for the front monthlies seems to differ
# going to take a closer look at this, since i want the dte's to be the same for x contract for relative value trading
# seems 
clean_orats_core %>%
  select(ticker, starts_with("dtexm")) %>%
  pivot_longer(dtexm1:dtexm4, names_to = "monthly_contract", values_to = "dte") %>%
  group_by(monthly_contract, dte) %>%
  count() %>%
  arrange(desc(n))

# visualization
clean_orats_core %>%
  select(ticker, starts_with("dtexm")) %>%
  pivot_longer(dtexm1:dtexm4, names_to = "monthly_contract", values_to = "dte") %>%
  ggplot(aes(monthly_contract, dte, fill = monthly_contract)) +
  geom_boxplot() +
  labs(title = "dte at dtexm{x} monthly contract",
       subtitle = "i would want the boxplots to display one value (ie. dtexm1 & dtemx2 = straight line)",
       x = "monthly contract", y = "days to expiration",
       caption = "seems the 3rd & 4th monthly contracts has a range of different dtes")
```

### orats clean - saving data
- same data that gets loaded into the relative value markdown exercise (option_data)

```{r}
saveRDS(clean_orats_core, "../data/clean_orats_core.rds")
```

### price clean - helper function (yahoo api query)
- using yahoo api for price data (need adjusted prices)

```{r}
# uses yahoo api - parameters are ticker, first_date, and last_date
# i've set it up to only return the date & adjusted close price since we'll be doing correlation analysis

# yahoo api rate seems to be 2k/hour, ~40k/day. since we are only pulling ~800 tickers, not going to set any sleep conditions
price_api <- function(ticker, first_date, last_date){
  
  request <- GET(sprintf(
    "https://query1.finance.yahoo.com/v7/finance/download/%s?period1=%s&period2=%s&interval=1d&events=history&includeAdjustedClose=true", 
    ticker, first_date, last_date))
  
  if(status_code(request) == 200){
    contents <- content(
      x = request,
      as = "parsed",
      type = "text/csv",
      encoding = "UTF-8",
      show_col_types = FALSE,
      col_select = c("date" = "Date", "adj_close" = "Adj Close"))
    
    if(nrow(contents) > 0){
      contents$ticker <- ticker
      return(contents[,c("ticker", "date", "adj_close")])
      
    } else {
      message(sprintf("%s returned no data.", ticker))
      return()
    }
  } else {
    message(sprintf("%s returned status code: %s. No data pulled.", ticker, status_code(request)))
    return()
  }
}
```

### price clean - query data

```{r}
# pulling last date from our orats option data, adding +1 because yahoo api pulls between dates
orats_date <- as.Date(clean_orats_core$date[1]) + 1

# saving parameter variables for price api function
tickers <- clean_orats_core$ticker
first_date <- as.numeric(as.POSIXct(as.character(orats_date - 365)))
last_date <- as.numeric(as.POSIXct(orats_date))

# foreach loop ran with parallel processing 
price_data <- foreach(var = 1:length(tickers), .combine = "bind_rows", .packages = "httr", .errorhandling = "pass") %dopar% {
  price_api(
    ticker = tickers[var],
    first_date = first_date,
    last_date = last_date)
}

head(price_data, 3)
```

### price clean - checking how much tickers we pulled
- ended up missing some tickers, gathered remaining & requeried
- then cleaned final price data

```{r}
# seems we only pulled 419/844 of our tickers
price_data %>%
  count(ticker) %>%
  arrange(n)

# looking at our leftover tickers, there are some with underscores in the name - yahoo doesn't recognize that
# going to pre-filter out those tickers (~35 occurrences)
# yahoo also doesn't give historical data for indices & require there be a prefix "^DJX" to the name - these won't be collected
leftover_tickers <- clean_orats_core %>%
  filter(!ticker %in% unique(price_data$ticker)) %>%
  filter(!str_detect(ticker, "_")) %>%
  pull(ticker)

leftover_tickers

# going to combine our price data tables and do some final cleaning
# the numbers doesn't seem bad (no 0's, etc)
clean_price_data <- price_data %>%
  arrange(ticker, date) %>%
  # summary()
  mutate(adj_close = round(adj_close, 2))

# sample
head(clean_price_data, 3)
```

### price clean - save cleaned price data
- same table used in relative value exercise (price_data)

```{r}
saveRDS(object = clean_price_data, file = "../data/clean_price_data.rds")
```


























